From dfef8d164ade704b4c97286d025a881446923e37 Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Fri, 13 Dec 2024 15:27:09 +0000
Subject: [PATCH] Add Implementations for low bit kernels

---
 .../kernels/Convolution/CustomConv.hpp        |  146 +
 .../kernels/FullyConnected/CustomFc.hpp       |   83 +
 .../kernels/Pooling/CustomPooling.hpp         |  114 +
 .../_Aidge_Arm/kernels/Utils/Macs.hpp         | 3371 ++++++++++++++++-
 .../kernels/Utils/nn_scaling_functions.hpp    |   18 +-
 .../kernels/Utils/subkernels_functions.hpp    |  312 ++
 .../_Aidge_Arm/kernels/Utils/swar_arm_acle.h  |  356 ++
 .../_Aidge_Arm/kernels/Utils/utils.hpp        |    2 +-
 8 files changed, 4232 insertions(+), 170 deletions(-)
 create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp
 create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp
 create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp
 create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp
 create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h

diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp
new file mode 100644
index 0000000..5f35aaf
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp
@@ -0,0 +1,146 @@
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+
+#ifndef __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
+#define __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
+
+#include <cmath>
+
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+
+namespace N2D2_Export {
+
+/**
+ * @brief   Convolution forward pass for bit-packed (low-bit) inputs and weights.
+ * @details For each output position and output channel: starts from the bias,
+ *          accumulates the kernel window with macsOnRange(), converts the sum
+ *          with sat() and packs it into outputs via compact_data_during_loop()
+ *          / compact_data_end_loop(). Offsets into inputs and weights advance
+ *          by INPUTS_BYTE / WEIGHTS_BYTE per pixel (defined in the body).
+ *          This kernel takes no INPUT_MEM_* / OUTPUT_MEM_* mapping parameters.
+ * @param[in]   inputs     Input feature map, indexed [y][x][channels]
+ * @param[out]  outputs    Output buffer, written through the packing helpers
+ * @param[in]   biasses    One bias per output channel
+ * @param[in]   weights    Kernel weights, indexed [och][sy][sx][channels]
+ * @param[in]   rescaling  Forwarded unchanged to sat()
+ */
+template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Sum_T, typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T, typename Rescaling_T>
+__attribute__((always_inline)) inline static
+void customconvcellPropagate(const Input_T* __restrict inputs,
+                                      Output_T* __restrict outputs,
+                                      const Bias_T* __restrict biasses,
+                                      const Weight_T* __restrict weights,
+                                      const Rescaling_T& __restrict rescaling)
+{
+    PackSupport infoPack = {0, 0};
+    // Bits stored per value: 8 when digits >= 8, otherwise digits rounded up to
+    // 1, 2, 4 or 8. Integer arithmetic only: std::ceil/std::floor are not
+    // constexpr before C++23, so Clang-based toolchains reject them here.
+    constexpr int bits_norm_in = (std::numeric_limits<Input_T>::digits >= 8)
+                        ? 8 : 8 / (8 / std::numeric_limits<Input_T>::digits);
+    constexpr int bits_norm_wt = (std::numeric_limits<Weight_T>::digits >= 8)
+                        ? 8 : 8 / (8 / std::numeric_limits<Weight_T>::digits);
+
+    // Bytes per pixel (all channels). NOTE(review): the extra `% 8` term is kept
+    // from the original; a remainder of 5..7 bits rounds up by two bytes, not
+    // one -- confirm this matches how the exporter packs the data.
+    constexpr int INPUTS_BYTE
+        = (NB_CHANNELS * bits_norm_in + (NB_CHANNELS * bits_norm_in) % 8 + 7) / 8;
+    constexpr int WEIGHTS_BYTE
+        = (NB_CHANNELS * bits_norm_wt + (NB_CHANNELS * bits_norm_wt) % 8 + 7) / 8;
+
+    int outputOffset = 0;
+
+    int iy = 0;
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0);
+        const int syMax = (PADDING_Y == 0) ? KERNEL_HEIGHT
+                                           : clamp(CHANNELS_HEIGHT + PADDING_Y - iy,
+                                                   0, KERNEL_HEIGHT);
+
+        int ix = 0;
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0);
+            const int sxMax = (PADDING_X == 0) ? KERNEL_WIDTH
+                                               : clamp(CHANNELS_WIDTH + PADDING_X - ix,
+                                                       0, KERNEL_WIDTH);
+
+            for (int och = 0; och < NB_OUTPUTS; ++och) {
+                Sum_T weightedSum = biasses[och];
+
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    // Skip kernel rows that fall into the vertical padding.
+                    if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) {
+                        continue;
+                    }
+                    const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE
+                                             + (ix - PADDING_X) * INPUTS_BYTE;
+
+                    const int weightsOffset = och * KERNEL_HEIGHT * KERNEL_WIDTH * WEIGHTS_BYTE
+                                              + sy * KERNEL_WIDTH * WEIGHTS_BYTE;
+
+                    // Fast path: without horizontal padding, and when a pixel's
+                    // channels fill whole bytes for both inputs and weights, the
+                    // whole kernel row is accumulated in a single macsOnRange call.
+                    if (PADDING_X == 0
+                        && (NB_CHANNELS * bits_norm_wt % 8 == 0)
+                        && (NB_CHANNELS * bits_norm_in % 8 == 0)) {
+
+                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(inputs + inputsOffset,
+                                                                weights + weightsOffset,
+                                                                weightedSum);
+                    }
+                    else {
+                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                            if(sx < sxMin || sx >= sxMax) {
+                                continue;
+                            }
+                            macsOnRange<NB_CHANNELS>(inputs + inputsOffset + sx * INPUTS_BYTE,
+                                                     weights + weightsOffset + sx * WEIGHTS_BYTE,
+                                                     weightedSum);
+                        }
+                    }
+                }
+                Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling);
+                compact_data_during_loop(output, outputs, outputOffset, infoPack);
+            }
+            compact_data_end_loop(outputs, outputOffset, infoPack);
+
+            ix += STRIDE_X;
+        }
+        iy += STRIDE_Y;
+    }
+}
+
+
+}   // N2D2_Export
+
+#endif  // __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp
new file mode 100644
index 0000000..dc36818
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp
@@ -0,0 +1,83 @@
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+
+#ifndef __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
+#define __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
+
+#include <cmath>
+
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+
+namespace N2D2_Export {
+
+/**
+ * @brief   Fully-connected forward pass for bit-packed (low-bit) inputs/weights.
+ * @details Per output: bias + macsOnRange() over every input pixel, then sat()
+ *          and packing through the compact_data_* helpers. Weights are indexed
+ *          [och][iy][ix][channels], with WEIGHTS_BYTE bytes per pixel.
+ */
+template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Sum_T, typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T, typename Rescaling_T>
+__attribute__((always_inline)) inline static
+void fccellPropagate(const Input_T* __restrict inputs,
+                                    Output_T* __restrict outputs,
+                                    const Bias_T* __restrict biasses,
+                                    const Weight_T* __restrict weights,
+                                    const Rescaling_T& __restrict rescaling)
+{
+    static_assert(OUTPUTS_HEIGHT == 1, "Outputs height should be 1");
+    static_assert(OUTPUTS_WIDTH == 1, "Outputs width should be 1");
+    PackSupport infoPack = {0, 0};
+    // Integer-only ceil: std::ceil is not constexpr before C++23 (Clang rejects it).
+    constexpr int INPUTS_BYTE
+        = ((NB_CHANNELS * std::numeric_limits<Input_T>::digits)
+          + (NB_CHANNELS * std::numeric_limits<Input_T>::digits) % 8 + 7) / 8;
+    constexpr int WEIGHTS_BYTE
+        = ((NB_CHANNELS * std::numeric_limits<Weight_T>::digits)
+          + (NB_CHANNELS * std::numeric_limits<Weight_T>::digits) % 8 + 7) / 8;
+
+    int outputOffset = 0;
+    for (int och = 0; och < NB_OUTPUTS; ++och) {
+        Sum_T weightedSum = biasses[och];
+        for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+            for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                const int weightsOffset = CHANNELS_HEIGHT * CHANNELS_WIDTH * WEIGHTS_BYTE * och
+                                            + (CHANNELS_WIDTH * iy + ix) * WEIGHTS_BYTE;
+                const int inputsOffset = (CHANNELS_WIDTH * iy + ix) * INPUTS_BYTE;
+                macsOnRange<NB_CHANNELS>(inputs + inputsOffset,
+                                         weights + weightsOffset, weightedSum);
+            }
+        }
+        Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling);
+        compact_data_during_loop(output, outputs, outputOffset, infoPack);
+    }
+    compact_data_end_loop(outputs, outputOffset, infoPack);
+}
+
+}   // N2D2_Export
+
+#endif  // __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp
new file mode 100644
index 0000000..bcc6a09
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp
@@ -0,0 +1,114 @@
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+
+#ifndef __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__
+#define __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__
+
+#include <cmath>
+
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+
+
+namespace N2D2_Export {
+
+/**
+ * @brief   Max pooling forward pass on bit-packed (low-bit) data.
+ * @details Each output pixel is produced in chunks of nb_data bytes: for every
+ *          chunk, parallelMaxPooling() folds the in-bounds kernel window into
+ *          maxVal and storeMaxPooling() writes it to outputs. Only Max pooling
+ *          with a Linear activation is accepted (see the static_asserts).
+ */
+template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+        int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+        int PADDING_Y, int PADDING_X,
+        int STRIDE_Y, int STRIDE_X,
+        int KERNEL_HEIGHT, int KERNEL_WIDTH,
+        Pooling_T POOLING, ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void custompoolcellPropagate(const Input_T* __restrict inputs,
+                                    Output_T* __restrict outputs)
+{
+    static_assert(std::is_same<Input_T, Output_T>::value, "Input_T and Output_T must be the same.");
+    static_assert(NB_CHANNELS == NB_OUTPUTS, "nb_channels should be equal to nb_outputs.");
+    static_assert(POOLING == Max , "Only supports Max pooling.");
+    static_assert(ACTIVATION == Linear, "Only supports a Linear activation.");
+
+    // Integer-only ceil: std::ceil is not constexpr before C++23.
+    constexpr int INPUTS_BYTE
+        = ((NB_CHANNELS * std::numeric_limits<Input_T>::digits)
+        + (NB_CHANNELS * std::numeric_limits<Input_T>::digits) % 8 + 7) / 8;
+    constexpr int OUTPUTS_BYTE
+        = ((NB_OUTPUTS * std::numeric_limits<Output_T>::digits)
+        + (NB_OUTPUTS * std::numeric_limits<Output_T>::digits) % 8 + 7) / 8;
+
+    int outputOffset = 0;
+
+    int iy = 0;
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0);
+        const int syMax = (PADDING_Y == 0) ? KERNEL_HEIGHT
+                                        : clamp(CHANNELS_HEIGHT + PADDING_Y - iy,
+                                                0, KERNEL_HEIGHT);
+
+        int ix = 0;
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0);
+            const int sxMax = (PADDING_X == 0) ? KERNEL_WIDTH
+                                            : clamp(CHANNELS_WIDTH + PADDING_X - ix,
+                                                    0, KERNEL_WIDTH);
+
+            int och_c = 0;
+            while (och_c < OUTPUTS_BYTE) {
+                // NOTE(review): the signed branch needs digits == 32, which plain
+                // int32_t (digits == 31) never satisfies -- confirm the intended types.
+                typename std::conditional<(!std::is_unsigned<Input_T>::value &&
+                        std::numeric_limits<Input_T>::digits == 32), int32_t, uint32_t>::type maxVal;
+                maxVal = std::numeric_limits<decltype(maxVal)>::lowest();
+                int nb_data = min(OUTPUTS_BYTE-och_c, get_pool_nbData(std::numeric_limits<Input_T>::digits));
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) {
+                        continue;
+                    }
+                    const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE
+                                            + (ix - PADDING_X) * INPUTS_BYTE + och_c;
+                    for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                        if(sx < sxMin || sx >= sxMax) {
+                            continue;
+                        }
+                        parallelMaxPooling(inputs + inputsOffset + sx*INPUTS_BYTE, maxVal, nb_data);
+                    }
+                }
+                storeMaxPooling(outputs, outputOffset, maxVal, nb_data);
+                och_c += nb_data;
+            }
+
+            ix += STRIDE_X;
+        }
+        iy += STRIDE_Y;
+    }
+}
+
+}
+#endif
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
index 5aa5183..a0a1f85 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
@@ -1,211 +1,3262 @@
-/*
-    (C) Copyright 2017 CEA LIST. All Rights Reserved.
-    Contributor(s): N2D2 Team
+// /*
+//     (C) Copyright 2017 CEA LIST. All Rights Reserved.
+//     Contributor(s): N2D2 Team
 
-    This software is governed by the CeCILL-C license under French law and
-    abiding by the rules of distribution of free software.  You can  use,
-    modify and/ or redistribute the software under the terms of the CeCILL-C
-    license as circulated by CEA, CNRS and INRIA at the following URL
-    "http://www.cecill.info".
+//     This software is governed by the CeCILL-C license under French law and
+//     abiding by the rules of distribution of free software.  You can  use,
+//     modify and/ or redistribute the software under the terms of the CeCILL-C
+//     license as circulated by CEA, CNRS and INRIA at the following URL
+//     "http://www.cecill.info".
 
-    As a counterpart to the access to the source code and  rights to copy,
-    modify and redistribute granted by the license, users are provided only
-    with a limited warranty  and the software's author,  the holder of the
-    economic rights,  and the successive licensors  have only  limited
-    liability.
+//     As a counterpart to the access to the source code and  rights to copy,
+//     modify and redistribute granted by the license, users are provided only
+//     with a limited warranty  and the software's author,  the holder of the
+//     economic rights,  and the successive licensors  have only  limited
+//     liability.
 
-    The fact that you are presently reading this means that you have had
-    knowledge of the CeCILL-C license and that you accept its terms.
-*/
+//     The fact that you are presently reading this means that you have had
+//     knowledge of the CeCILL-C license and that you accept its terms.
+// */
 
-#ifndef __N2D2_EXPORT_CPP_MACS_HPP__
-#define __N2D2_EXPORT_CPP_MACS_HPP__
+// #ifndef __N2D2_EXPORT_CPP_MACS_HPP__
+// #define __N2D2_EXPORT_CPP_MACS_HPP__
 
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-#include <cmsis_compiler.h>
+// #include <cstdint>
+// #include <limits>
+// #include <type_traits>
+// #include <cmsis_compiler.h>
 
-namespace N2D2_Export {
+// #include "swar_arm_acle.h"
 
+// namespace N2D2_Export {
 
-template<typename Input_T>
-inline static
-uint32_t XTB16(uint32_t val) 
+
+// template<typename Input_T>
+// inline static
+// uint32_t XTB16(uint32_t val) 
+// {
+//     return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val);
+// }
+
+// template<int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          typename Input_T,
+//          typename Weight_T,
+//          typename Sum_T>
+// inline static
+// Sum_T dualMac(const Input_T* __restrict inputs, 
+//               const Weight_T* __restrict weights, 
+//               Sum_T weightedSum) 
+// {
+//     weightedSum += inputs[0] * weights[0]
+//         + inputs[INPUTS_INC] * weights[WEIGHTS_INC];
+
+//     return weightedSum;
+// }
+
+// template<int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          typename Input_T,
+//          typename Weight_T,
+//          typename Sum_T,
+//          typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr>
+// inline static
+// Sum_T quadMac(const Input_T* __restrict inputs, 
+//               const Weight_T* __restrict weights, 
+//               Sum_T weightedSum) 
+// {
+//     weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
+//         + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
+//         + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
+//         + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
+
+//     return weightedSum;
+// }
+
+// template<int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          typename Input_T,
+//          typename Weight_T,
+//          typename Sum_T,
+//          typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr>
+// inline static
+// Sum_T quadMac(const Input_T* __restrict inputs, 
+//               const Weight_T* __restrict weights, 
+//               Sum_T weightedSum) 
+// {
+//     if(INPUTS_INC != 1 || WEIGHTS_INC != 1) {
+//         weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
+//             + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
+//             + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
+//             + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
+
+//         return weightedSum;
+//     }
+
+//     // Inputs loading & preparation
+//     uint32_t in;
+//     memcpy((void*) &in, inputs, sizeof(in));
+    
+//     uint32_t in1 = XTB16<Input_T>(in);
+//     uint32_t in2 = XTB16<Input_T>(in >> 8);
+    
+//     // Weights loading & preparation
+//     uint32_t wt;
+//     memcpy((void*) &wt, weights, sizeof(wt));
+    
+//     uint32_t wt1 = XTB16<Weight_T>(wt);
+//     uint32_t wt2 = XTB16<Weight_T>(wt >> 8);
+
+//     // Computation
+//     if(std::is_same<Sum_T, int32_t>::value) {
+//         weightedSum = __SMLAD(in1, wt1, weightedSum);
+//         weightedSum = __SMLAD(in2, wt2, weightedSum);
+//     }
+//     else {
+//         weightedSum = __SMLALD(in1, wt1, weightedSum);
+//         weightedSum = __SMLALD(in2, wt2, weightedSum);
+        
+//     }
+    
+//     return weightedSum;
+// }
+
+
+
+
+// // ----------------------------------------------------------------------------
+// // -------------- MAC computing functions for kernel 4W-4A --------------------
+// // ----------------------------------------------------------------------------
+
+// /**
+//  * @brief   Unsigned mono mac operation (4W/4A version)
+//  * @details Performs one mac operation for signed 4-bits weights
+//  *          and unsigned 4-bits inputs.
+//  * 
+//  * @tparam  Input_T     Input type (should be udata<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to input vector
+//  * @param[in]      weights         Pointer to kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the mono mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T monoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     weightedSum += __UBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Signed mono mac operation (4W/4A version)
+//  * @details Performs one mac operation for signed 4-bits weights
+//  *          and signed 4-bits inputs.
+//  * 
+//  * @tparam  Input_T     Input type (should be data<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to input vector
+//  * @param[in]      weights         Pointer to kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the mono mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T monoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     weightedSum += __SBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Unsigned dual mac operation (4W/4A version)
+//  * @details Performs two mac operations for signed 4-bits weights
+//  *          and unsigned 4-bits inputs. Extracts the two 4-bits weights
+//  *          from a stored 8-bits weight and associates them into 
+//  *          a 32-bits value. Then extracts the two 4-bits inputs
+//  *          from a stored 8-bits input and associates them into 
+//  *          a 32-bits value. Finally performs a dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be udata<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the dual mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T dualMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint8_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     uint32_t wght = __BFI(w1, w0, 16, 16);
+
+//     uint8_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     int32_t a0 = __UBFX(in, 0, 4);
+//     int32_t a1 = __UBFX(in, 4, 4);
+//     uint32_t act = __BFI(a1, a0, 16, 16);
+
+//     weightedSum = __SMLAD(act, wght, weightedSum);
+
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Signed dual mac operation (4W/4A version)
+//  * @details Performs two mac operations for signed 4-bits weights
+//  *          and signed 4-bits inputs. Extracts the two 4-bits weights
+//  *          from a stored 8-bits weight and associates them into 
+//  *          a 32-bits value. Then extracts the two 4-bits inputs
+//  *          from a stored 8-bits input and associates them into 
+//  *          a 32-bits value. Finally performs a dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be data<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the dual mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T dualMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint8_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     uint32_t wght = __BFI(w1, w0, 16, 16);
+
+//     uint8_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     int32_t a0 = __SBFX(in, 0, 4);
+//     int32_t a1 = __SBFX(in, 4, 4);
+//     uint32_t act = __BFI(a1, a0, 16, 16);
+
+//     weightedSum = __SMLAD(act, wght, weightedSum);
+
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Unsigned quad mac operation (4W/4A version)
+//  * @details Performs four mac operations for signed 4-bits weights
+//  *          and unsigned 4-bits inputs. Extracts the four 4-bits weights
+//  *          from two stored 8-bits weights and associates them into 
+//  *          two 32-bits values. Then extracts the four 4-bits inputs
+//  *          from two stored 8-bits inputs and associates them into 
+//  *          two 32-bits values. Finally performs a double dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be udata<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the quad mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T quadMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint16_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     int32_t w2 = __SBFX(wt, 8, 4);
+//     int32_t w3 = __SBFX(wt, 12, 4);
+
+//     uint32_t evenW1 = __BFI(w2, w0, 16, 16);
+//     uint32_t oddW1  = __BFI(w3, w1, 16, 16);
+
+//     uint16_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     int32_t a0 = __UBFX(in, 0, 4);
+//     int32_t a1 = __UBFX(in, 4, 4);
+//     int32_t a2 = __UBFX(in, 8, 4);
+//     int32_t a3 = __UBFX(in, 12, 4);
+
+//     uint32_t evenA1 = __BFI(a2, a0, 16, 16);
+//     uint32_t oddA1  = __BFI(a3, a1, 16, 16);
+
+//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
+//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
+
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Signed quad mac operation (4W/4A version)
+//  * @details Performs four mac operations for signed 4-bits weights
+//  *          and signed 4-bits inputs. Extracts the four 4-bits weights
+//  *          from two stored 8-bits weights and associates them into 
+//  *          two 32-bits values. Then extracts the four 4-bits inputs
+//  *          from two stored 8-bits inputs and associates them into 
+//  *          two 32-bits values. Finally performs a double dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be data<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the quad mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T quadMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint16_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     int32_t w2 = __SBFX(wt, 8, 4);
+//     int32_t w3 = __SBFX(wt, 12, 4);
+
+//     uint32_t evenW1 = __PKHBT(w2, w0, 16);
+//     uint32_t oddW1  = __PKHBT(w3, w1, 16);
+
+//     uint16_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     int32_t a0 = __SBFX(in, 0, 4);
+//     int32_t a1 = __SBFX(in, 4, 4);
+//     int32_t a2 = __SBFX(in, 8, 4);
+//     int32_t a3 = __SBFX(in, 12, 4);
+    
+//     uint32_t evenA1 = __PKHBT(a2, a0, 16);
+//     uint32_t oddA1  = __PKHBT(a3, a1, 16);
+
+//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
+//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
+
+//     return weightedSum;
+// }
+
+// /**
+//  * @brief   Unsigned octo mac operation (4W/4A version)
+//  * @details Performs eight mac operations for signed 4-bits weights
+//  *          and unsigned 4-bits inputs. Extracts the eight 4-bits weights
+//  *          from four stored 8-bits weights and associates them into 
+//  *          four 32-bits values. Then extracts the eight 4-bits inputs
+//  *          from four stored 8-bits inputs and associates them into 
+//  *          four 32-bits values. Finally performs a quadruple dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be udata<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the octo mac operation
+//  */
+// // template<typename Input_T, typename Weight_T,
+// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
+// //          && std::numeric_limits<Weight_T>::digits == 4
+// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// // __attribute__((always_inline)) static inline
+// // Sum_T octoMac(const Input_T* __restrict inputs,
+// //               const Weight_T* __restrict weights,
+// //               Sum_T weightedSum)
+// // {
+// //     uint32_t wt;
+// //     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+// //     int32_t w0 = __SBFX(wt, 0, 4);
+// //     int32_t w1 = __SBFX(wt, 4, 4);
+// //     int32_t w2 = __SBFX(wt, 8, 4);
+// //     int32_t w3 = __SBFX(wt, 12, 4);
+// //     int32_t w4 = __SBFX(wt, 16, 4);
+// //     int32_t w5 = __SBFX(wt, 20, 4);
+// //     int32_t w6 = __SBFX(wt, 24, 4);
+// //     int32_t w7 = __SBFX(wt, 28, 4);
+
+// //     // uint32_t weight0 = __BFI(w4, w0, 16, 16);
+// //     // uint32_t weight1 = __BFI(w5, w1, 16, 16);
+// //     // uint32_t weight2 = __BFI(w6, w2, 16, 16);
+// //     // uint32_t weight3 = __BFI(w7, w3, 16, 16);
+
+// //     uint32_t weight0 = __PKHBT(w0, w4, 16);
+// //     uint32_t weight1 = __PKHBT(w1, w5, 16);
+// //     uint32_t weight2 = __PKHBT(w2, w6, 16);
+// //     uint32_t weight3 = __PKHBT(w3, w7, 16);
+
+// //     uint32_t in;
+// //     std::memcpy((void*) &in, inputs, sizeof(in));
+
+// //     uint32_t act0 = in & 0xF000F;
+// //     uint32_t act1 = (in >> 4) & 0xF000F;
+// //     uint32_t act2 = (in >> 8) & 0xF000F;
+// //     uint32_t act3 = (in >> 12) & 0xF000F;
+
+// //     weightedSum = __SMLAD(act0, weight0, weightedSum);
+// //     weightedSum = __SMLAD(act1, weight1, weightedSum);
+// //     weightedSum = __SMLAD(act2, weight2, weightedSum);
+// //     weightedSum = __SMLAD(act3, weight3, weightedSum);
+
+// //     return weightedSum;
+// // }
+
+// // template<typename Input_T, typename Weight_T,
+// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
+// //          && std::numeric_limits<Weight_T>::digits == 4
+// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// // __attribute__((always_inline)) static inline
+// // Sum_T octoMac(const Input_T* __restrict inputs,
+// //               const Weight_T* __restrict weights,
+// //               Sum_T weightedSum)
+// // {
+// //     union n2d2_dataword wt;
+// //     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+// //     union n2d2_udataword in;
+// //     std::memcpy((void*) &in, inputs, sizeof(in));
+
+// //     for (int i = 0; i < 4; ++i) {
+// //         weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0;
+// //         weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1;
+// //     }
+
+// //     // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0;
+// //     // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1;
+// //     // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0;
+// //     // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1;
+// //     // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0;
+// //     // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1;
+// //     // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0;
+// //     // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1;
+
+// //     return weightedSum;
+// // }
+
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint32_t wt;
+//     memcpy((void*) &wt, weights, sizeof(wt));
+
+//     // Works with weights * 4096 (weights << 12)
+//     const uint32_t WeightMask = 0xF000F000;
+//     uint32_t weight0 = WeightMask & (wt << 12);
+//     uint32_t weight1 = WeightMask & (wt << 8);
+//     uint32_t weight2 = WeightMask & (wt << 4);
+//     uint32_t weight3 = WeightMask & (wt);
+
+//     uint32_t in;
+//     memcpy((void*) &in, inputs, sizeof(in));
+
+//     const uint32_t ActMask = 0x000F000F; // to explicit instructions
+//     uint32_t act0 = in & ActMask;
+//     // Expect second operand shift
+//     uint32_t act1 = ActMask & (in >> 4);
+//     uint32_t act2 = ActMask & (in >> 8);
+//     uint32_t act3 = ActMask & (in >> 12);
+
+//     Sum_T sum = 0;
+//     sum = __SMLAD(act0, weight0, sum);
+//     sum = __SMLAD(act1, weight1, sum);
+//     sum = __SMLAD(act2, weight2, sum);
+//     sum = __SMLAD(act3, weight3, sum);
+
+//     return weightedSum + (sum >> 12);
+// }
+
+// /**
+//  * @brief   Signed octo mac operation (4W/4A version)
+//  * @details Performs eight mac operations for signed 4-bits weights
+//  *          and signed 4-bits inputs. Extracts the eight 4-bits weights
+//  *          from four stored 8-bits weights and associates them into 
+//  *          four 32-bits values. Then extracts the eight 4-bits inputs
+//  *          from four stored 8-bits inputs and associates them into 
+//  *          four 32-bits values. Finally performs a quadruple dual mac operation 
+//  *          with the __SMLAD instruction
+//  * 
+//  * @tparam  Input_T     Input type (should be data<4>)
+//  * @tparam  Weight_T    Weight type (should be data<4>)
+//  * 
+//  * @param[in]      inputs          Pointer to compressed input vector
+//  * @param[in]      weights         Pointer to compressed kernel weights
+//  * @param[in,out]  weightedSum     Accumulating sum from the 
+//  *                                 previous mac operations
+//  * @returns                        Updated weightedSum with 
+//  *                                 the result of the octo mac operation
+//  */
+// template<typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint32_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     int32_t w2 = __SBFX(wt, 8, 4);
+//     int32_t w3 = __SBFX(wt, 12, 4);
+//     int32_t w4 = __SBFX(wt, 16, 4);
+//     int32_t w5 = __SBFX(wt, 20, 4);
+//     int32_t w6 = __SBFX(wt, 24, 4);
+//     int32_t w7 = __SBFX(wt, 28, 4);
+
+//     uint32_t evenW1 = __PKHBT(w2, w0, 16);
+//     uint32_t oddW1  = __PKHBT(w3, w1, 16);
+//     uint32_t evenW2 = __PKHBT(w6, w4, 16);
+//     uint32_t oddW2  = __PKHBT(w7, w5, 16);
+
+//     uint32_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     int32_t a0 = __SBFX(in, 0, 4);
+//     int32_t a1 = __SBFX(in, 4, 4);
+//     int32_t a2 = __SBFX(in, 8, 4);
+//     int32_t a3 = __SBFX(in, 12, 4);
+//     int32_t a4 = __SBFX(in, 16, 4);
+//     int32_t a5 = __SBFX(in, 20, 4);
+//     int32_t a6 = __SBFX(in, 24, 4);
+//     int32_t a7 = __SBFX(in, 28, 4);
+
+//     uint32_t evenA1 = __PKHBT(a2, a0, 16);
+//     uint32_t oddA1  = __PKHBT(a3, a1, 16);
+//     uint32_t evenA2 = __PKHBT(a6, a4, 16);
+//     uint32_t oddA2  = __PKHBT(a7, a5, 16);
+
+//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
+//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
+//     weightedSum = __SMLAD(evenA2, evenW2, weightedSum);
+//     weightedSum = __SMLAD(oddA2, oddW2, weightedSum);
+
+//     return weightedSum;
+// }
+
+
+// // template<typename Input_T, typename Weight_T, typename Sum_T,
+// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
+// //          && std::numeric_limits<Weight_T>::digits == 4
+// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// // void macsOnParallel(const Input_T* __restrict inputs,
+// //                     const Weight_T* __restrict weights,
+// //                     Sum_T* weightedSums,
+// //                     const int nb_data)
+// // {
+// //     uint32_t wt = 0;
+// //     std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
+
+// //     uint32_t in = 0;
+// //     std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
+
+// //     for (int i = 0; i < nb_data; ++i) {
+// //         weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4);
+// //     }
+// // }
+
+// // template<typename Input_T, typename Weight_T, typename Sum_T,
+// //          typename std::enable_if<(!std::is_unsigned<Input_T>::value
+// //          && std::numeric_limits<Weight_T>::digits == 4
+// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// // void macsOnParallel(const Input_T* __restrict inputs,
+// //                     const Weight_T* __restrict weights,
+// //                     Sum_T* weightedSums,
+// //                     const int nb_data)
+// // {
+// //     uint32_t wt = 0;
+// //     std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
+
+// //     uint32_t in = 0;
+// //     std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
+
+// //     for (int i = 0; i < nb_data; ++i) {
+// //         weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4);
+// //     }
+// // }
+
+
+
+
+// // **************************************************************************
+// // * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS *
+// // **************************************************************************
+
+// template<int NB_ITERATIONS,
+//          int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          class Input_T, 
+//          class Weight_T,
+//          class Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr>
+// inline static 
+// void macsOnRange(const Input_T* __restrict /*inputs*/, 
+//                  const Weight_T* __restrict /*weights*/, 
+//                  Sum_T& __restrict /*weightedSum*/) 
+// {
+//     // Nothing to do
+// }
+
+// template<int NB_ITERATIONS,
+//          int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          class Input_T, 
+//          class Weight_T,
+//          class Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr>
+// inline static 
+// void macsOnRange(const Input_T* __restrict inputs, 
+//                  const Weight_T* __restrict weights, 
+//                  Sum_T& __restrict weightedSum) 
+// {
+//     weightedSum += (*weights) * (*inputs);
+// }
+
+// template<int NB_ITERATIONS,
+//          int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          class Input_T, 
+//          class Weight_T,
+//          class Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr>
+// inline static 
+// void macsOnRange(const Input_T* __restrict inputs, 
+//                  const Weight_T* __restrict weights, 
+//                  Sum_T& __restrict weightedSum) 
+// {
+//     weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC, 
+//                                                             weights + 2*WEIGHTS_INC, 
+//                                                             weightedSum);
+// }
+
+// /**
+//  * @brief   MACs Processing
+//  * @details Performs NB_ITERATIONS MACs operations, storing results into the
+//  *          weightedSum variable. 
+//  * 
+//  * @tparam  NB_ITERATIONS   Number of MACs to perform
+//  * @tparam  INPUTS_INC      Input Stride
+//  * @tparam  WEIGHTS_INC     Weights Stride
+//  * @tparam  Input_T         Input Type
+//  * 
+//  * @param   inputs          Pointer to inputs vector
+//  * @param   weights         Pointer to weights vector
+//  * @param   weightedSum     Pointer to weightedSum
+// */
+// template<int NB_ITERATIONS,
+//          int INPUTS_INC = 1,
+//          int WEIGHTS_INC = 1,
+//          class Input_T, 
+//          class Weight_T,
+//          class Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr>
+// inline static 
+// void macsOnRange(const Input_T* __restrict inputs, 
+//                  const Weight_T* __restrict weights, 
+//                  Sum_T& __restrict weightedSum) 
+// {
+//     weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC, 
+//                                                             weights + 4*WEIGHTS_INC, 
+//                                                             weightedSum);
+// }
+
+
+// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// void macsOnRange(const Input_T* __restrict inputs,
+//                  const Weight_T* __restrict weights,
+//                  Sum_T& weightedSum)
+// {
+//     constexpr unsigned int idxI 
+//         = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1;
+//     constexpr unsigned int idxW 
+//         = (std::numeric_limits<Weight_T>::digits > 4) ? 2 : 1;
+
+//     weightedSum = dualMac(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum);
+// }
+
+// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<NB_ITERATIONS >= 4 
+//          && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// void macsOnRange(const Input_T* __restrict inputs,
+//                  const Weight_T* __restrict weights,
+//                  Sum_T& weightedSum)
+// {
+//     constexpr unsigned int idxI 
+//         = (std::numeric_limits<Input_T>::digits > 4) 
+//           ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
+
+//     constexpr unsigned int idxW = 4;
+
+//     weightedSum = quadMac(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
+// }
+
+// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) 
+//          && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// void macsOnRange(const Input_T* __restrict inputs,
+//                  const Weight_T* __restrict weights,
+//                  Sum_T& weightedSum)
+// {
+//     constexpr unsigned int idxI 
+//         = (std::numeric_limits<Input_T>::digits > 4) 
+//           ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
+
+//     constexpr unsigned int idxW = 2;
+
+//     weightedSum = quadMac(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
+// }
+
+// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+//          typename std::enable_if<NB_ITERATIONS >= 8 
+//          && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// void macsOnRange(const Input_T* __restrict inputs,
+//                  const Weight_T* __restrict weights,
+//                  Sum_T& weightedSum)
+// {
+//     constexpr unsigned int idxI 
+//         = (std::numeric_limits<Input_T>::digits > 4) 
+//           ? 8 : (std::numeric_limits<Input_T>::digits == 4) 
+//             ? 4 : (std::numeric_limits<Input_T>::digits == 2)
+//               ? 2 : 1;
+
+//     constexpr unsigned int idxW = 4;
+
+//     weightedSum = octoMac(inputs, weights, weightedSum);
+//     macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum);
+// }
+
+
+// }   // N2D2_Export
+
+// #endif  // __N2D2_EXPORT_CPP_MACS_HPP__
+
+
+
+
+/**
+ ******************************************************************************
+ * @file     mac_functions.hpp
+ * @brief    Mac operation functions for ARM Cortex m7 and m4
+ *           This file provides different functions to perform
+ *           signed and unsigned mac operations. Those functions can calculate
+ *           up to eight mac operations at once.
+ *           The file also provides two general mac operations which can be
+ *           used in other files, especially in Network.hpp
+ * 
+ ******************************************************************************
+ * @attention
+ * 
+ * (C) Copyright 2021 CEA LIST. All Rights Reserved.
+ *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
+ *                  Philippe DORE (philippe.dore@cea.fr)
+ *                  David BRIAND (david.briand@cea.fr)
+ * 
+ * This file is not part of the open source version of N2D2 and is NOT under
+ * the CeCILL-C license. This code is the property of the CEA. It can not be
+ * copied or disseminated without its authorization.
+ * 
+ ******************************************************************************
+ */
+
+#ifndef __N2D2_MAC_FUNCTIONS_HPP__
+#define __N2D2_MAC_FUNCTIONS_HPP__
+
+#include <cstring>
+#include "swar_arm_acle.h"
+#include "kernels/typedefs.hpp"
+
+
+// ----------------------------------------------------------------------------
+// --------------- MAC computing functions for all kernels --------------------
+// ----------------------------------------------------------------------------
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 8W-8A --------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief   Mono mac operation (8W/8A version)
+ * @details Performs a single multiply-accumulate: inputs[0] is cast to
+ *          Sum_T, multiplied by weights[0] and added to weightedSum.
+ *
+ * @tparam  Input_T     Input type (udata<8> or data<8>)
+ * @tparam  Weight_T    Weight type (should be data<8>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (element 0 is read)
+ * @param[in]      weights         Pointer to kernel weights (element 0 is read)
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the mono mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8  // overload enabled only when both types report 8 digits
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    weightedSum += (Sum_T)inputs[0] * weights[0];  // the input is converted to Sum_T before the multiplication
+    return weightedSum;
+}
+
+/**
+ * @brief   Dual mac operation (8W/8A version)
+ * @details Performs two multiply-accumulates: inputs[0] * weights[0] and
+ *          inputs[1] * weights[1] are summed and added to weightedSum.
+ *
+ * @tparam  Input_T     Input type (udata<8> or data<8>)
+ * @tparam  Weight_T    Weight type (should be data<8>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (elements 0-1 are read)
+ * @param[in]      weights         Pointer to kernel weights (elements 0-1 are read)
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the dual mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8  // overload enabled only when both types report 8 digits
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    weightedSum += (Sum_T)inputs[0] * weights[0] + (Sum_T)inputs[1] * weights[1];  // each input is converted to Sum_T before its multiplication
+    return weightedSum;
+}
+
+/**
+ * @brief   Unsigned quad mac operation (8W/8A version)
+ * @details Performs four mac operations for signed 8-bits weights
+ *          and unsigned 8-bits inputs. Four inputs and four weights are
+ *          each loaded as one 32-bits word, then split into two 32-bits
+ *          values (zero extension for the inputs, sign extension for
+ *          the weights). Finally performs a double dual mac operation
+ *          with the __SMLAD instruction.
+ *
+ * @tparam  Input_T     Input type (should be udata<8>)
+ * @tparam  Weight_T    Weight type (should be data<8>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (4 bytes are read)
+ * @param[in]      weights         Pointer to kernel weights (4 bytes are read)
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the quad mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value  // this overload handles unsigned inputs only
+         && std::numeric_limits<Weight_T>::digits == 8
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    uint32_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));  // copy sizeof(in) bytes of inputs into one word
+
+    uint32_t in1 = __UXTB16(in);          // presumably bytes 0 and 2 as zero-extended 16-bits lanes -- confirm in swar_arm_acle.h
+    uint32_t in2 = __UXTB16_RORn(in, 8);  // presumably bytes 1 and 3 (word rotated right by 8 first) -- confirm in swar_arm_acle.h
+
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));  // copy sizeof(wt) bytes of weights into one word
+
+    uint32_t wt1 = __SXTB16(wt);          // same lane selection as in1, presumably sign-extended
+    uint32_t wt2 = __SXTB16_RORn(wt, 8);  // same lane selection as in2, presumably sign-extended
+
+    weightedSum = __SMLAD(in1, wt1, weightedSum);  // presumably adds both lane products to weightedSum (ARM SMLAD) -- confirm
+    weightedSum = __SMLAD(in2, wt2, weightedSum);
+    
+    return weightedSum;
+}
+
+/**
+ * @brief   Signed quad mac operation (8W/8A version)
+ * @details Performs four mac operations for signed 8-bits weights
+ *          and signed 8-bits inputs. Four inputs and four weights are
+ *          each loaded as one 32-bits word, then split into two 32-bits
+ *          values (sign extension for both the inputs and the
+ *          weights). Finally performs a double dual mac operation
+ *          with the __SMLAD instruction.
+ *
+ * @tparam  Input_T     Input type (should be data<8>)
+ * @tparam  Weight_T    Weight type (should be data<8>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (4 bytes are read)
+ * @param[in]      weights         Pointer to kernel weights (4 bytes are read)
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the quad mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value  // this overload handles non-unsigned inputs only
+         && std::numeric_limits<Weight_T>::digits == 8
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    uint32_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));  // copy sizeof(in) bytes of inputs into one word
+
+    uint32_t in1 = __SXTB16(in);          // presumably bytes 0 and 2 as sign-extended 16-bits lanes -- confirm in swar_arm_acle.h
+    uint32_t in2 = __SXTB16_RORn(in, 8);  // presumably bytes 1 and 3 (word rotated right by 8 first) -- confirm in swar_arm_acle.h
+
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));  // copy sizeof(wt) bytes of weights into one word
+
+    uint32_t wt1 = __SXTB16(wt);          // same lane selection as in1
+    uint32_t wt2 = __SXTB16_RORn(wt, 8);  // same lane selection as in2
+
+    weightedSum = __SMLAD(in1, wt1, weightedSum);  // presumably adds both lane products to weightedSum (ARM SMLAD) -- confirm
+    weightedSum = __SMLAD(in2, wt2, weightedSum);
+    
+    return weightedSum;
+}
+
+template<typename Input_T, typename Weight_T, typename Sum_T,  // Per-element MAC (8W/8A): one accumulator in weightedSums per input/weight pair
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs,    // nb_data bytes are copied from here
+                    const Weight_T* __restrict weights,  // nb_data bytes are copied from here
+                    Sum_T* weightedSums,                 // elements [0, nb_data) are updated in place
+                    const int nb_data)                   // number of pairs to process; NOTE(review): never range-checked
+{
+    union n2d2_dataword wt = {0};                // local staging word for the weights
+    std::memcpy((void*) &wt, weights, nb_data);  // NOTE(review): writes past wt if nb_data > sizeof(wt) -- confirm callers bound nb_data
+
+    typename std::conditional<(!std::is_unsigned<Input_T>::value), 
+            union n2d2_dataword, union n2d2_udataword>::type in = {0};  // signed or unsigned staging word, chosen from Input_T
+    std::memcpy((void*) &in, inputs, nb_data);   // NOTE(review): same unchecked size as the weights copy above
+
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += (Sum_T)wt.bytes[i] * in.bytes[i];  // weight byte is converted to Sum_T before the multiplication
+    }
+}
+
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 4W-8A --------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief   Mono mac operation (4W/8A version)
+ * @details Performs one mac operation for signed 4-bits weights
+ *          and 8-bits inputs (signed or not). The weight is obtained
+ *          from weights[0] with __SBFX(weights[0], 4, 4).
+ * @tparam  Input_T     Input type (udata<8> or data<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (element 0 is read)
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the mono mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 4  // overload enabled for 4-digit weights with 8-digit inputs
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    weightedSum += (Sum_T)inputs[0] * __SBFX(weights[0], 4, 4);  // NOTE(review): reads the field at offset 4 (presumably the high nibble) while dualMac reads offsets 0 and 4 -- confirm where a lone weight is packed
+    return weightedSum;
+}
+
+/**
+ * @brief   Unsigned dual mac operation (4W/8A version)
+ * @details Performs two mac operations for signed 4-bits weights
+ *          and unsigned 8-bits inputs. Extracts the two 4-bits weights
+ *          from one stored byte and associates them into a 32-bits
+ *          value. Then spreads two 8-bits inputs into a 32-bits value
+ *          and zero extends them. Finally performs a
+ *          dual mac operation with the __SMLAD instruction
+ *
+ * @tparam  Input_T     Input type (should be udata<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Accumulator type, deduced from weightedSum
+ * @param[in]      inputs          Pointer to input vector (2 bytes are read)
+ * @param[in]      weights         Pointer to compressed kernel weights (1 byte is read)
+ * @param[in]      weightedSum     Accumulated sum from the previous
+ *                                 mac operations (taken by value)
+ * @returns                        weightedSum updated with
+ *                                 the result of the dual mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value  // this overload handles unsigned inputs only
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    uint8_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));  // one byte, holding both packed weights
+
+    int32_t w0 = __SBFX(wt, 0, 4);          // field at bit offset 0, width 4 (presumably sign-extended -- see swar_arm_acle.h)
+    int32_t w1 = __SBFX(wt, 4, 4);          // field at bit offset 4, width 4
+    uint32_t wght = __BFI(w0, w1, 16, 16);  // presumably packs w0 and w1 into the two 16-bits lanes -- confirm operand order in swar_arm_acle.h
+
+    uint16_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));  // two input bytes
+    
+    uint32_t act = ((in << 8) | in);  // low byte of in stays in byte 0, high byte of in lands in byte 2; byte 1 is a mix of both
+    act = __UXTB16(act);              // presumably keeps bytes 0 and 2 as zero-extended 16-bits lanes, dropping byte 1 -- confirm
+    
+    weightedSum = __SMLAD(act, wght, weightedSum);  // presumably adds both lane products to weightedSum (ARM SMLAD) -- confirm
+
+    return weightedSum;
+}
+
+/**
+ * @brief   Signed dual mac operation (4W/8A version)
+ * @details Computes two multiply-accumulates between signed 4-bits
+ *          weights and signed 8-bits inputs. One byte holding two
+ *          weights is split into two sign extended values gathered in
+ *          a 32-bits word; two input bytes are sign extended into a
+ *          second 32-bits word; a single __SMLAD call then accumulates
+ *          both products
+ *
+ * @tparam  Input_T     Input type (should be data<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the two products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Fetch the byte carrying both 4-bits weights
+    uint8_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    // Extract each nibble as a signed value, then gather them in one word
+    int32_t lowNibble = __SBFX(packedWeights, 0, 4);
+    int32_t highNibble = __SBFX(packedWeights, 4, 4);
+    uint32_t weightPair = __BFI(lowNibble, highNibble, 16, 16);
+
+    // Fetch the two 8-bits inputs
+    uint16_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // Bits 0-7 and 16-23 now hold the low and high bytes of packedInputs
+    uint32_t inputPair = ((packedInputs << 8) | packedInputs);
+    inputPair = __SXTB16(inputPair);
+
+    return __SMLAD(inputPair, weightPair, weightedSum);
+}
+
+/**
+ * @brief   Unsigned quad mac operation (4W/8A version)
+ * @details Computes four multiply-accumulates between signed 4-bits
+ *          weights and unsigned 8-bits inputs. Two bytes holding four
+ *          weights are split into sign extended values gathered two by
+ *          two in 32-bits words; four input bytes are zero extended
+ *          two by two into 32-bits words; two __SMLAD calls then
+ *          accumulate the four products
+ *
+ * @tparam  Input_T     Input type (should be udata<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the four products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Fetch the two bytes carrying four 4-bits weights
+    uint16_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    // Signed extraction of every nibble, least significant first
+    int32_t nibble0 = __SBFX(packedWeights, 0, 4);
+    int32_t nibble1 = __SBFX(packedWeights, 4, 4);
+    int32_t nibble2 = __SBFX(packedWeights, 8, 4);
+    int32_t nibble3 = __SBFX(packedWeights, 12, 4);
+
+    // Fetch the four 8-bits inputs
+    uint32_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // First dual mac: __UXTB16 lanes against nibbles 1 and 3
+    uint32_t weightLanes = __PKHBT(nibble1, nibble3, 16);
+    uint32_t inputLanes = __UXTB16(packedInputs);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+
+    // Second dual mac: __UXTB16_RORn lanes against nibbles 0 and 2
+    weightLanes = __PKHBT(nibble0, nibble2, 16);
+    inputLanes = __UXTB16_RORn(packedInputs, 8);
+    return __SMLAD(inputLanes, weightLanes, weightedSum);
+}
+
+/**
+ * @brief   Signed quad mac operation (4W/8A version)
+ * @details Computes four multiply-accumulates between signed 4-bits
+ *          weights and signed 8-bits inputs. Two bytes holding four
+ *          weights are split into sign extended values gathered two by
+ *          two in 32-bits words; four input bytes are sign extended
+ *          two by two into 32-bits words; two __SMLAD calls then
+ *          accumulate the four products
+ *
+ * @tparam  Input_T     Input type (should be data<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the four products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Fetch the two bytes carrying four 4-bits weights
+    uint16_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    // Signed extraction of every nibble, least significant first
+    int32_t nibble0 = __SBFX(packedWeights, 0, 4);
+    int32_t nibble1 = __SBFX(packedWeights, 4, 4);
+    int32_t nibble2 = __SBFX(packedWeights, 8, 4);
+    int32_t nibble3 = __SBFX(packedWeights, 12, 4);
+
+    uint32_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // NOTE(review): the unsigned overload builds this operand with
+    // __PKHBT(w1, w3, 16) -- presumably equivalent to this __BFI; confirm
+    uint32_t weightLanes = __BFI(nibble3, nibble1, 16, 16);
+    uint32_t inputLanes = __SXTB16(packedInputs);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+
+    // Second dual mac, on the two remaining nibbles
+    weightLanes = __BFI(nibble2, nibble0, 16, 16);
+    inputLanes = __SXTB16_RORn(packedInputs, 8);
+    return __SMLAD(inputLanes, weightLanes, weightedSum);
+}
+
+/**
+ * @brief   Unsigned octo mac operation (4W/8A version)
+ * @details Performs eight mac operations for signed 4-bits weights
+ *          and unsigned 8-bits inputs. The eight 4-bits weights read
+ *          from four bytes are masked into bits 12-15 of each 16-bits
+ *          lane, i.e. used multiplied by 4096. The eight 8-bits inputs
+ *          are zero extended into four 32-bits values. Four dual mac
+ *          operations are done with the __SMLAD instruction and their
+ *          sum is shifted right by 12 bits to remove the 4096 factor
+ *
+ * @tparam  Input_T     Input type (should be udata<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 Updated weightedSum with
+ *                          the result of the octo mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // 1st implementation (disabled, kept as a readable reference)
+    // uint32_t wt;
+    // std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    // int32_t w0 = __SBFX(wt, 0, 4);
+    // int32_t w1 = __SBFX(wt, 4, 4);
+    // int32_t w2 = __SBFX(wt, 8, 4);
+    // int32_t w3 = __SBFX(wt, 12, 4);
+    // int32_t w4 = __SBFX(wt, 16, 4);
+    // int32_t w5 = __SBFX(wt, 20, 4);
+    // int32_t w6 = __SBFX(wt, 24, 4);
+    // int32_t w7 = __SBFX(wt, 28, 4);
+
+    // uint32_t evenW1 = __PKHBT(w0, w2, 16);
+    // uint32_t oddW1  = __PKHBT(w1, w3, 16);
+    // uint32_t evenW2 = __PKHBT(w4, w6, 16);
+    // uint32_t oddW2  = __PKHBT(w5, w7, 16);
+
+    // uint32_t in1, in2;
+    // std::memcpy((void*) &in1, inputs, sizeof(in1));
+    // std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
+
+    // uint32_t evenA1 = __UXTB16(in1);
+    // uint32_t oddA1  = __UXTB16_RORn(in1, 8);
+    // uint32_t evenA2 = __UXTB16(in2);
+    // uint32_t oddA2  = __UXTB16_RORn(in2, 8);
+
+    // weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
+    // weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
+    // weightedSum = __SMLAD(evenA2, oddW2, weightedSum);
+    // weightedSum = __SMLAD(oddA2, evenW2, weightedSum);
+
+    // 2nd implementation (disabled)
+    // union n2d2_dataword wt;
+    // std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    // union n2d2_udataword in1, in2;
+    // std::memcpy((void*) &in1, inputs, sizeof(in1));
+    // std::memcpy((void*) &in2, inputs + 4, sizeof(in2));
+
+    // weightedSum += (data<32>)(in1.bytes[0]) * wt.half_bytes[0].fields.op1;
+    // weightedSum += (data<32>)(in1.bytes[1]) * wt.half_bytes[0].fields.op0;
+    // weightedSum += (data<32>)(in1.bytes[2]) * wt.half_bytes[1].fields.op1;
+    // weightedSum += (data<32>)(in1.bytes[3]) * wt.half_bytes[1].fields.op0;
+    // weightedSum += (data<32>)(in2.bytes[0]) * wt.half_bytes[2].fields.op1;
+    // weightedSum += (data<32>)(in2.bytes[1]) * wt.half_bytes[2].fields.op0;
+    // weightedSum += (data<32>)(in2.bytes[2]) * wt.half_bytes[3].fields.op1;
+    // weightedSum += (data<32>)(in2.bytes[3]) * wt.half_bytes[3].fields.op0;
+
+    // 3rd implementation (active)
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    // Works with weights * 4096 (weights << 12)
+    const uint32_t WeightMask = 0xF000F000;
+    uint32_t weight0 = WeightMask & (wt << 12);
+    uint32_t weight1 = WeightMask & (wt << 8);
+    uint32_t weight2 = WeightMask & (wt << 4);
+    uint32_t weight3 = WeightMask & (wt);
+
+    uint32_t in1, in2;
+    std::memcpy((void*) &in1, inputs, sizeof(in1));
+    std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
+
+    // NOTE(review): presumably reorders halfwords so lanes match weight0..3
+    uint32_t in_a = __PKHBT(in1, in2, 16);
+    uint32_t in_b = __PKHTB(in2, in1, 16);
+
+    uint32_t evenA1 = __UXTB16(in_a);
+    uint32_t oddA1  = __UXTB16_RORn(in_a, 8);
+    uint32_t evenA2 = __UXTB16(in_b);
+    uint32_t oddA2  = __UXTB16_RORn(in_b, 8);
+
+    // Accumulate apart from weightedSum: only these products carry
+    // the 4096 factor removed by the shift below
+    Sum_T sum = 0;
+    sum = __SMLAD(oddA1, weight0, sum);
+    sum = __SMLAD(evenA1, weight1, sum);
+    sum = __SMLAD(oddA2, weight2, sum);
+    sum = __SMLAD(evenA2, weight3, sum);
+    weightedSum += sum >> 12;
+
+    return weightedSum;
+}
+
+/**
+ * @brief   Signed octo mac operation (4W/8A version)
+ * @details Computes eight multiply-accumulates between signed 4-bits
+ *          weights and signed 8-bits inputs. Four bytes holding eight
+ *          weights are split into sign extended values gathered two by
+ *          two in 32-bits words; eight input bytes are sign extended
+ *          two by two into 32-bits words; four __SMLAD calls then
+ *          accumulate the eight products
+ *
+ * @tparam  Input_T     Input type (should be data<8>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the eight products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Fetch the four bytes carrying eight 4-bits weights
+    uint32_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    // Signed extraction of every nibble, least significant first
+    int32_t nibble0 = __SBFX(packedWeights, 0, 4);
+    int32_t nibble1 = __SBFX(packedWeights, 4, 4);
+    int32_t nibble2 = __SBFX(packedWeights, 8, 4);
+    int32_t nibble3 = __SBFX(packedWeights, 12, 4);
+    int32_t nibble4 = __SBFX(packedWeights, 16, 4);
+    int32_t nibble5 = __SBFX(packedWeights, 20, 4);
+    int32_t nibble6 = __SBFX(packedWeights, 24, 4);
+    int32_t nibble7 = __SBFX(packedWeights, 28, 4);
+
+    // Fetch the eight 8-bits inputs as two words
+    uint32_t lowInputs, highInputs;
+    std::memcpy(&lowInputs, inputs, sizeof(lowInputs));
+    std::memcpy(&highInputs, (inputs + 4), sizeof(highInputs));
+
+    // First input word against nibbles 0 to 3
+    uint32_t weightLanes = __BFI(nibble3, nibble1, 16, 16);
+    uint32_t inputLanes = __SXTB16(lowInputs);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+    weightLanes = __BFI(nibble2, nibble0, 16, 16);
+    inputLanes = __SXTB16_RORn(lowInputs, 8);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+
+    // Second input word against nibbles 4 to 7
+    weightLanes = __BFI(nibble7, nibble5, 16, 16);
+    inputLanes = __SXTB16(highInputs);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+    weightLanes = __BFI(nibble6, nibble4, 16, 16);
+    inputLanes = __SXTB16_RORn(highInputs, 8);
+    return __SMLAD(inputLanes, weightLanes, weightedSum);
+}
+
+// Per-element MACs (4W/8A): weightedSums[i] += weight nibble i * inputs[i]
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(
+            std::numeric_limits<Weight_T>::digits == 4 && 
+            std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs,
+                    const Weight_T* __restrict weights,
+                    Sum_T* weightedSums,
+                    const int nb_data)
+{
+    // Weights are staged in one 32-bit word: requires nb_data <= 8.
+    // Byte count is an integer round-up (no double-precision ceil() needed).
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, (nb_data > 0) ? (nb_data + 1) / 2 : 0);
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * inputs[i];
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 4W-4A --------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief   Unsigned mono mac operation (4W/4A version)
+ * @details Performs one mac operation between a signed 4-bits weight
+ *          and an unsigned 4-bits input, each taken from bits 4 to 7
+ *          of the first element of its buffer.
+ *
+ * @tparam  Input_T     Input type (should be udata<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum plus the result of the mono mac
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    const auto product = __UBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
+    weightedSum += product;
+    return weightedSum;
+}
+
+/**
+ * @brief   Signed mono mac operation (4W/4A version)
+ * @details Performs one mac operation between a signed 4-bits weight
+ *          and a signed 4-bits input, each taken from bits 4 to 7
+ *          of the first element of its buffer.
+ *
+ * @tparam  Input_T     Input type (should be data<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ *
+ * @param[in]  inputs       Pointer to input vector
+ * @param[in]  weights      Pointer to kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum plus the result of the mono mac
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    const auto product = __SBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
+    weightedSum += product;
+    return weightedSum;
+}
+
+/**
+ * @brief   Unsigned dual mac operation (4W/4A version)
+ * @details Computes two multiply-accumulates between signed 4-bits
+ *          weights and unsigned 4-bits inputs. One byte of weights and
+ *          one byte of inputs are read; the two nibbles of each byte
+ *          are extracted (signed for the weights, unsigned for the
+ *          inputs) and gathered in a 32-bits word; a single __SMLAD
+ *          call then accumulates both products
+ *
+ * @tparam  Input_T     Input type (should be udata<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Type of the accumulated sum
+ *
+ * @param[in]  inputs       Pointer to compressed input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the two products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Each operand stream provides one byte holding two 4-bits values
+    uint8_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    uint8_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // Weights: signed extraction of both nibbles
+    int32_t weightLow = __SBFX(packedWeights, 0, 4);
+    int32_t weightHigh = __SBFX(packedWeights, 4, 4);
+    uint32_t weightPair = __BFI(weightHigh, weightLow, 16, 16);
+
+    // Inputs: unsigned extraction of both nibbles
+    int32_t inputLow = __UBFX(packedInputs, 0, 4);
+    int32_t inputHigh = __UBFX(packedInputs, 4, 4);
+    uint32_t inputPair = __BFI(inputHigh, inputLow, 16, 16);
+
+    // Both products are accumulated at once
+    return __SMLAD(inputPair, weightPair, weightedSum);
+}
+
+/**
+ * @brief   Signed dual mac operation (4W/4A version)
+ * @details Computes two multiply-accumulates between signed 4-bits
+ *          weights and signed 4-bits inputs. One byte of weights and
+ *          one byte of inputs are read; the two nibbles of each byte
+ *          are extracted as signed values and gathered in a 32-bits
+ *          word; a single __SMLAD call then accumulates both
+ *          products
+ *
+ * @tparam  Input_T     Input type (should be data<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Type of the accumulated sum
+ *
+ * @param[in]  inputs       Pointer to compressed input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the two products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Each operand stream provides one byte holding two 4-bits values
+    uint8_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    uint8_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // Weights: signed extraction of both nibbles
+    int32_t weightLow = __SBFX(packedWeights, 0, 4);
+    int32_t weightHigh = __SBFX(packedWeights, 4, 4);
+    uint32_t weightPair = __BFI(weightHigh, weightLow, 16, 16);
+
+    // Inputs: signed extraction of both nibbles
+    int32_t inputLow = __SBFX(packedInputs, 0, 4);
+    int32_t inputHigh = __SBFX(packedInputs, 4, 4);
+    uint32_t inputPair = __BFI(inputHigh, inputLow, 16, 16);
+
+    // Both products are accumulated at once
+    return __SMLAD(inputPair, weightPair, weightedSum);
+}
+
+/**
+ * @brief   Unsigned quad mac operation (4W/4A version)
+ * @details Computes four multiply-accumulates between signed 4-bits
+ *          weights and unsigned 4-bits inputs. Two bytes of weights and
+ *          two bytes of inputs are read; their four nibbles are
+ *          extracted (signed for the weights, unsigned for the inputs)
+ *          and gathered two by two in 32-bits words, weights and
+ *          inputs being laid out the same way; two __SMLAD calls then
+ *          accumulate the four products
+ *
+ * @tparam  Input_T     Input type (should be udata<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Type of the accumulated sum
+ *
+ * @param[in]  inputs       Pointer to compressed input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the four
+ *                          products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Each operand stream provides two bytes holding four 4-bits values;
+    // weights are extracted as signed fields, inputs as unsigned fields
+    uint16_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    uint16_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // First dual mac: fields at bits 0 and 8 of both operands
+    int32_t weightLow = __SBFX(packedWeights, 0, 4);
+    int32_t weightHigh = __SBFX(packedWeights, 8, 4);
+    int32_t inputLow = __UBFX(packedInputs, 0, 4);
+    int32_t inputHigh = __UBFX(packedInputs, 8, 4);
+    uint32_t weightLanes = __BFI(weightHigh, weightLow, 16, 16);
+    uint32_t inputLanes = __BFI(inputHigh, inputLow, 16, 16);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+
+    // Second dual mac: fields at bits 4 and 12 of both operands
+    weightLow = __SBFX(packedWeights, 4, 4);
+    weightHigh = __SBFX(packedWeights, 12, 4);
+    inputLow = __UBFX(packedInputs, 4, 4);
+    inputHigh = __UBFX(packedInputs, 12, 4);
+    weightLanes = __BFI(weightHigh, weightLow, 16, 16);
+    inputLanes = __BFI(inputHigh, inputLow, 16, 16);
+
+    return __SMLAD(inputLanes, weightLanes, weightedSum);
+}
+
+/**
+ * @brief   Signed quad mac operation (4W/4A version)
+ * @details Computes four multiply-accumulates between signed 4-bits
+ *          weights and signed 4-bits inputs. Two bytes of weights and
+ *          two bytes of inputs are read; their four nibbles are
+ *          extracted as signed values and gathered two by two in
+ *          32-bits words, weights and inputs being laid out the same
+ *          way; two __SMLAD calls then accumulate the four
+ *          products
+ *
+ * @tparam  Input_T     Input type (should be data<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Type of the accumulated sum
+ *
+ * @param[in]  inputs       Pointer to compressed input vector
+ * @param[in]  weights      Pointer to compressed kernel weights
+ * @param[in]  weightedSum  Sum accumulated by the previous mac operations
+ * @returns                 weightedSum increased by the four
+ *                          products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Each operand stream provides two bytes holding four 4-bits values;
+    // weights and inputs are both extracted as signed fields
+    uint16_t packedWeights;
+    std::memcpy(&packedWeights, weights, sizeof(packedWeights));
+
+    uint16_t packedInputs;
+    std::memcpy(&packedInputs, inputs, sizeof(packedInputs));
+
+    // First dual mac: fields at bits 0 and 8 of both operands
+    int32_t weightLow = __SBFX(packedWeights, 0, 4);
+    int32_t weightHigh = __SBFX(packedWeights, 8, 4);
+    int32_t inputLow = __SBFX(packedInputs, 0, 4);
+    int32_t inputHigh = __SBFX(packedInputs, 8, 4);
+    uint32_t weightLanes = __PKHBT(weightHigh, weightLow, 16);
+    uint32_t inputLanes = __PKHBT(inputHigh, inputLow, 16);
+    weightedSum = __SMLAD(inputLanes, weightLanes, weightedSum);
+
+    // Second dual mac: fields at bits 4 and 12 of both operands
+    weightLow = __SBFX(packedWeights, 4, 4);
+    weightHigh = __SBFX(packedWeights, 12, 4);
+    inputLow = __SBFX(packedInputs, 4, 4);
+    inputHigh = __SBFX(packedInputs, 12, 4);
+    weightLanes = __PKHBT(weightHigh, weightLow, 16);
+    inputLanes = __PKHBT(inputHigh, inputLow, 16);
+
+    return __SMLAD(inputLanes, weightLanes, weightedSum);
+}
+
+/**
+ * @brief   Unsigned octo mac operation (4W/4A version)
+ * @details Performs eight mac operations for signed 4-bits weights
+ *          and unsigned 4-bits inputs. Extracts the eight 4-bits weights
+ *          from four stored 8-bits weights and associates them into 
+ *          four 32-bits values. Then extracts the eight 4-bits inputs
+ *          from four stored 8-bits inputs and associates them into 
+ *          four 32-bits values. Finally performs a quadruple dual mac operation 
+ *          with the __SMLAD instruction
+ * 
+ * @tparam  Input_T     Input type (should be udata<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * 
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the 
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with 
+ *                                 the result of the octo mac operation
+ */
+// template<typename Input_T, typename Weight_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint32_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     int32_t w2 = __SBFX(wt, 8, 4);
+//     int32_t w3 = __SBFX(wt, 12, 4);
+//     int32_t w4 = __SBFX(wt, 16, 4);
+//     int32_t w5 = __SBFX(wt, 20, 4);
+//     int32_t w6 = __SBFX(wt, 24, 4);
+//     int32_t w7 = __SBFX(wt, 28, 4);
+
+//     // uint32_t weight0 = __BFI(w4, w0, 16, 16);
+//     // uint32_t weight1 = __BFI(w5, w1, 16, 16);
+//     // uint32_t weight2 = __BFI(w6, w2, 16, 16);
+//     // uint32_t weight3 = __BFI(w7, w3, 16, 16);
+
+//     uint32_t weight0 = __PKHBT(w0, w4, 16);
+//     uint32_t weight1 = __PKHBT(w1, w5, 16);
+//     uint32_t weight2 = __PKHBT(w2, w6, 16);
+//     uint32_t weight3 = __PKHBT(w3, w7, 16);
+
+//     uint32_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     uint32_t act0 = in & 0xF000F;
+//     uint32_t act1 = (in >> 4) & 0xF000F;
+//     uint32_t act2 = (in >> 8) & 0xF000F;
+//     uint32_t act3 = (in >> 12) & 0xF000F;
+
+//     weightedSum = __SMLAD(act0, weight0, weightedSum);
+//     weightedSum = __SMLAD(act1, weight1, weightedSum);
+//     weightedSum = __SMLAD(act2, weight2, weightedSum);
+//     weightedSum = __SMLAD(act3, weight3, weightedSum);
+
+//     return weightedSum;
+// }
+
+// template<typename Input_T, typename Weight_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     union n2d2_dataword wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     union n2d2_udataword in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     for (int i = 0; i < 4; ++i) {
+//         weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0;
+//         weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1;
+//     }
+
+//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1;
+
+//     return weightedSum;
+// }
+
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // actK and weightK select the nibbles at bits 4K and 4K+16 of their word
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    // Works with weights * 4096 (weights << 12)
+    const uint32_t WeightMask = 0xF000F000;
+    uint32_t weight0 = WeightMask & (wt << 12);
+    uint32_t weight1 = WeightMask & (wt << 8);
+    uint32_t weight2 = WeightMask & (wt << 4);
+    uint32_t weight3 = WeightMask & (wt);
+
+    uint32_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));
+
+    const uint32_t ActMask = 0x000F000F; // to explicit instructions
+    uint32_t act0 = in & ActMask;
+    // Expect second operand shift
+    uint32_t act1 = ActMask & (in >> 4);
+    uint32_t act2 = ActMask & (in >> 8);
+    uint32_t act3 = ActMask & (in >> 12);
+    // Only `sum` carries the 4096 factor, so it is shifted on its own
+    Sum_T sum = 0;
+    sum = __SMLAD(act0, weight0, sum);
+    sum = __SMLAD(act1, weight1, sum);
+    sum = __SMLAD(act2, weight2, sum);
+    sum = __SMLAD(act3, weight3, sum);
+    return weightedSum + (sum >> 12);
+}
+
+/**
+ * @brief   Octo MAC for signed 4-bit weights and signed 4-bit inputs (4W/4A)
+ * @details Reads one 32-bit word of packed weights and one 32-bit word of
+ *          packed inputs (eight 4-bit values each). Every nibble is
+ *          sign-extended with __SBFX, then nibbles k and k+2 of a same word
+ *          are packed into the two 16-bit halves of a register with
+ *          __PKHBT. Each of the four __SMLAD calls multiplies one pair of
+ *          inputs by the matching pair of weights and adds both products
+ *          to the accumulator.
+ *
+ * @tparam  Input_T     Input type (should be data<4>)
+ * @tparam  Weight_T    Weight type (should be data<4>)
+ * @tparam  Sum_T       Accumulator type
+ *
+ * @param[in]  inputs       Pointer to the packed inputs (4 bytes are read)
+ * @param[in]  weights      Pointer to the packed weights (4 bytes are read)
+ * @param[in]  weightedSum  Sum accumulated by the previous MAC operations
+ * @returns                 weightedSum plus the eight new products
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Fetch the eight packed nibbles of each operand as one 32-bit word
+    uint32_t packedW;
+    uint32_t packedA;
+    std::memcpy((void*) &packedW, weights, sizeof(packedW));
+    std::memcpy((void*) &packedA, inputs, sizeof(packedA));
+
+    // Sign-extend every weight nibble (nibble k sits in bits [4k+3 : 4k])
+    const int32_t wNib0 = __SBFX(packedW, 0, 4);
+    const int32_t wNib1 = __SBFX(packedW, 4, 4);
+    const int32_t wNib2 = __SBFX(packedW, 8, 4);
+    const int32_t wNib3 = __SBFX(packedW, 12, 4);
+    const int32_t wNib4 = __SBFX(packedW, 16, 4);
+    const int32_t wNib5 = __SBFX(packedW, 20, 4);
+    const int32_t wNib6 = __SBFX(packedW, 24, 4);
+    const int32_t wNib7 = __SBFX(packedW, 28, 4);
+
+    // Same extraction for the input nibbles
+    const int32_t aNib0 = __SBFX(packedA, 0, 4);
+    const int32_t aNib1 = __SBFX(packedA, 4, 4);
+    const int32_t aNib2 = __SBFX(packedA, 8, 4);
+    const int32_t aNib3 = __SBFX(packedA, 12, 4);
+    const int32_t aNib4 = __SBFX(packedA, 16, 4);
+    const int32_t aNib5 = __SBFX(packedA, 20, 4);
+    const int32_t aNib6 = __SBFX(packedA, 24, 4);
+    const int32_t aNib7 = __SBFX(packedA, 28, 4);
+
+    // Pack nibble k+2 into the low half-word and nibble k into the high one
+    const uint32_t wPair02 = __PKHBT(wNib2, wNib0, 16);
+    const uint32_t wPair13 = __PKHBT(wNib3, wNib1, 16);
+    const uint32_t wPair46 = __PKHBT(wNib6, wNib4, 16);
+    const uint32_t wPair57 = __PKHBT(wNib7, wNib5, 16);
+    const uint32_t aPair02 = __PKHBT(aNib2, aNib0, 16);
+    const uint32_t aPair13 = __PKHBT(aNib3, aNib1, 16);
+    const uint32_t aPair46 = __PKHBT(aNib6, aNib4, 16);
+    const uint32_t aPair57 = __PKHBT(aNib7, aNib5, 16);
+
+    // Each __SMLAD adds two input*weight products to the running sum
+    weightedSum = __SMLAD(aPair02, wPair02, weightedSum);
+    weightedSum = __SMLAD(aPair13, wPair13, weightedSum);
+    weightedSum = __SMLAD(aPair46, wPair46, weightedSum);
+    return __SMLAD(aPair57, wPair57, weightedSum);
+}
+
+
+// Element-wise MACs on packed 4-bit values (signed weights, unsigned inputs):
+// weightedSums[i] += weight nibble i * input nibble i, for every i < nb_data.
+// Both operands are staged in one 32-bit word, so nb_data must lie in [0, 8].
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                    Sum_T* weightedSums, const int nb_data)
+{
+    const int nb_bytes = (nb_data + 1) / 2;  // two nibbles per byte, rounded up without double-precision ceil()
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, nb_bytes);
+    uint32_t in = 0;
+    std::memcpy((void*) &in, inputs, nb_bytes);
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4);
+    }
+}
+
+// Element-wise MACs on packed 4-bit values (signed weights, signed inputs):
+// weightedSums[i] += weight nibble i * input nibble i, for every i < nb_data.
+// Both operands are staged in one 32-bit word, so nb_data must lie in [0, 8].
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                    Sum_T* weightedSums, const int nb_data)
+{
+    const int nb_bytes = (nb_data + 1) / 2;  // two nibbles per byte, rounded up without double-precision ceil()
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, nb_bytes);
+    uint32_t in = 0;
+    std::memcpy((void*) &in, inputs, nb_bytes);
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4);
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+// ------------------ Notes about performing MAC operations -------------------
+// --------------------------- with 1-bit weights -----------------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @note How to perform MAC operations with 1-bit weight
+ * 
+ * Working with a 1-bit weight means working with only two possible values
+ * for each weight. A convention has therefore been defined and is used by
+ * the following functions in this file.
+ * Convention: when the value of a weight is 0, it means 1
+ *             when the value of a weight is 1, it means -1
+ * 
+ * Example: let's take a simple dual MAC operation
+ *          weightedSum = w0 * a0 + w1 * a1;
+ * 
+ * if w0 = 0x00 and w1 = 0x01 then weightedSum should be:
+ *          weightedSum = a0 - a1;
+ * 
+ * To easily perform MAC operations and use as often as possible
+ * SIMD instructions to parallelize and speed up MAC calculations, most of
+ * the following functions use the same scheme:
+ * 
+ *  - Perform a parallel subtraction of the weights from 0
+ *      Some SIMD instructions such as __USUB16 and __USUB8 perform
+ *      parallel subtractions and set a Greater or Equal flag (GE) for each
+ *      lane whose subtraction result is greater than or equal to zero.
+ *      Thus, if the result of 0 - w0 >= 0 ==> GE[0] = 1
+ *                             0 - w0 < 0  ==> GE[0] = 0
+ *      (the results of the subtractions are not saved because only the
+ *       GE flags are needed)
+ * 
+ *  - Use of the __SEL instruction to read the GE flags
+ *      __SEL selects, lane by lane, one of its two inputs according to
+ *      the GE flags provided by the previous subtraction. In the case of
+ *      the 1W/8A project, the two possible values selected by __SEL are
+ *      (+input) or (-input). Thus, __SEL is often used like "__SEL(in, -in)".
+ *      The results of __SEL are saved as MAC results
+ * 
+ *  - Addition of the accumulating sums and the results of the MAC operations
+ *      Use of __SADD16 or __SADD8 for signed additions
+ * 
+ */
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 8A ------------------------------------
+// ------------------------------- 1W / 7A ------------------------------------
+// ------------------------------- 1W / 6A ------------------------------------
+// ------------------------------- 1W / 5A ------------------------------------
+// ----------------------------------------------------------------------------
+
+// Single MAC with a 1-bit weight: a set op7 bit adds -inputs[0] to weightedSum, a cleared bit adds +inputs[0].
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+               Sum_T weightedSum)
+{
+    if (weights[0].fields.op7) { return weightedSum += (Sum_T)(-(inputs[0])); }
+    return weightedSum += (Sum_T)(inputs[0]);
+}
+
+// NB_ITERATIONS == 2: MACs inputs[0..1] with weight bits op7, op6 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 2)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+}
+
+// NB_ITERATIONS == 3: MACs inputs[0..2] with weight bits op7..op5 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 3)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += bits.op5 ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+}
+
+// NB_ITERATIONS == 4: MACs inputs[0..3] with weight bits op7..op4 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += bits.op5 ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += bits.op4 ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+}
+
+// NB_ITERATIONS == 5: MACs inputs[0..4] with weight bits op7..op3 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += bits.op5 ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += bits.op4 ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += bits.op3 ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+}
+
+// NB_ITERATIONS == 6: MACs inputs[0..5] with weight bits op7..op2 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 6)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += bits.op5 ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += bits.op4 ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += bits.op3 ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+    weightedSum += bits.op2 ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
+}
+
+// NB_ITERATIONS == 7: MACs inputs[0..6] with weight bits op7..op1 (bit set: add -input, bit clear: add +input).
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs, const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    const auto& bits = weights[0].fields;
+    weightedSum += bits.op7 ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += bits.op6 ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += bits.op5 ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += bits.op4 ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += bits.op3 ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+    weightedSum += bits.op2 ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
+    weightedSum += bits.op1 ? (Sum_T)(-(inputs[6])) : (Sum_T)(inputs[6]);
+}
+
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 8A ------------------------------------
+// ----------------------------------------------------------------------------
+// octoMac, 1W/8A: eight MACs with 1-bit weights; a weight bit of 0 adds the input, a bit of 1 subtracts it.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T weightedSum)
+{
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 1);   // one byte holds the eight 1-bit weights
+    wt |= wt << 16;                         // mirror it into the upper half-word for the high 16-bit lane
+    // First four input bytes: even bytes (0, 2) and odd bytes (1, 3) are widened to 16 bits, then negated copies are built
+    memcpy((void*) &in, inputs, sizeof(in));
+    uint32_t evenA1 = __UXTB16(in);
+    uint32_t oddA1  = __UXTB16_RORn(in, 8);
+    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
+    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 0 and 2 (even bytes): __USUB16 is run only for the GE flags that __SEL reads to pick +input or -input
+    __USUB16(0, wt & 0x40001);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 1 and 3 (odd bytes)
+    __USUB16(0, wt & 0x80002);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+
+    // Next four inputs, weight bits 4..7 (assumes one byte per Input_T - TODO confirm)
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 4 and 6 (even bytes)
+    __USUB16(0, wt & 0x400010);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 5 and 7 (odd bytes)
+    __USUB16(0, wt & 0x800020);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // NOTE(review): __SADD16 adds per 16-bit lane, so weightedSum appears to hold two partial sums; presumably merged by the caller - confirm
+    return weightedSum;
+}
+// quadquadMac, 1W/8A: sixteen MACs with 1-bit weights; a weight bit of 0 adds the input, a bit of 1 subtracts it.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
+{
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 2);   // two bytes hold the sixteen 1-bit weights
+    wt |= wt << 16;                         // mirror them into the upper half-word for the high 16-bit lane
+    // First four input bytes: even bytes (0, 2) and odd bytes (1, 3) are widened to 16 bits, then negated copies are built
+    memcpy((void*) &in, inputs, sizeof(in));
+    uint32_t evenA1 = __UXTB16(in);
+    uint32_t oddA1  = __UXTB16_RORn(in, 8);
+    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
+    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 0 and 2 (even bytes): __USUB16 is run only for the GE flags that __SEL reads to pick +input or -input
+    __USUB16(0, wt & 0x40001);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 1 and 3 (odd bytes)
+    __USUB16(0, wt & 0x80002);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+
+    // Next four inputs, weight bits 4..7 (assumes one byte per Input_T - TODO confirm)
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 4 and 6 (even bytes)
+    __USUB16(0, wt & 0x400010);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 5 and 7 (odd bytes)
+    __USUB16(0, wt & 0x800020);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum); 
+
+    // Inputs 8..11, weight bits 8..11
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 8 and 10 (even bytes)
+    __USUB16(0, wt & 0x4000100);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 9 and 11 (odd bytes)
+    __USUB16(0, wt & 0x8000200);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+
+    // Inputs 12..15, weight bits 12..15
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    // Weight bits 12 and 14 (even bytes)
+    __USUB16(0, wt & 0x40001000);  
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);  
+    // Weight bits 13 and 15 (odd bytes)
+    __USUB16(0, wt & 0x80002000);  
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    // NOTE(review): __SADD16 adds per 16-bit lane, so weightedSum appears to hold two partial sums; presumably merged by the caller - confirm
+    return weightedSum;
+}
+
+/**
+ * @brief   32 MAC operations with 1-bit weights and 8-bit inputs (1W/8A)
+ * @details The inputs are consumed four bytes at a time: __UXTB16 and
+ *          __UXTB16_RORn widen the even and the odd bytes to 16-bit lanes and
+ *          __SSUB16 builds their negated copies. __USUB16(0, bits) is run only
+ *          for its GE flags, set in the lanes whose weight bit is 0; __SEL
+ *          keeps +input in those lanes and -input in the others, and __SADD16
+ *          adds the selection to the two 16-bit lanes of weightedSum.
+ *
+ * @param[in]  inputs       Pointer to the inputs (eight 4-byte loads)
+ * @param[in]  weights      Pointer to the packed weights (4 bytes are read)
+ * @param[in]  weightedSum  Accumulator, updated as two 16-bit lanes
+ * @returns                 Updated weightedSum
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
+{
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t wt;
+    memcpy((void*) &wt, weights, 4);
+    // Mirror weights 0..15 (wt1) and 16..31 (wt2) into both half-words
+    uint32_t wt1 = __PKHBT(wt, wt, 16);
+    uint32_t wt2 = __PKHTB(wt, wt, 16);
+
+    // Inputs 0..3: weight bits 0/2 steer the even bytes, bits 1/3 the odd ones
+    memcpy((void*) &in, inputs, sizeof(in));
+    uint32_t evenA1 = __UXTB16(in);
+    uint32_t oddA1  = __UXTB16_RORn(in, 8);
+    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
+    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt1 & 0x40001);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt1 & 0x80002);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 4..7: weight bits 4/6 (even bytes) and 5/7 (odd bytes)
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt1 & 0x400010);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt1 & 0x800020);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 8..11: weight bits 8/10 (even bytes) and 9/11 (odd bytes)
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt1 & 0x4000100);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt1 & 0x8000200);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 12..15: weight bits 12/14 (even bytes) and 13/15 (odd bytes)
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt1 & 0x40001000);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt1 & 0x80002000);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 16..19: weight bits 16/18 (even bytes) and 17/19 (odd bytes)
+    memcpy((void*) &in, inputs + 16, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt2 & 0x40001);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt2 & 0x80002);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 20..23: weight bits 20/22 (even bytes) and 21/23 (odd bytes)
+    memcpy((void*) &in, inputs + 20, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt2 & 0x400010);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt2 & 0x800020);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 24..27: weight bits 24/26 (even bytes) and 25/27 (odd bytes)
+    memcpy((void*) &in, inputs + 24, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt2 & 0x4000100);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt2 & 0x8000200);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Inputs 28..31: weight bits 28/30 (even bytes) and 29/31 (odd bytes)
+    memcpy((void*) &in, inputs + 28, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1  = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+    __USUB16(0, wt2 & 0x40001000);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+    __USUB16(0, wt2 & 0x80002000);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    return weightedSum;
+}
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 7A ------------------------------------
+// ----------------------------------------------------------------------------
+
+// octoMac, 1W/7A: eight MACs with 1-bit weights on 7-bit inputs stored one per
+// byte; a weight bit of 0 adds the input, a weight bit of 1 subtracts it.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T weightedSum)
+{
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 1);
+    wt |= wt << 8;     // replicate the weight byte into the four byte lanes
+    wt |= wt << 16;
+
+    // First four inputs, steered by weight bits 0..3 (bit k is tested in byte lane k)
+    memcpy((void*) &in, inputs, sizeof(in));
+    // Sign-extend bit 6 of every byte. 0x40 is used rather than 0xC0 so that the
+    // 32-bit add cannot carry into the next byte lane while bit 7 is clear.
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0x40404040) ^ 0x40404040;
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08040201);   // only the GE flags are used, by __SEL below
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8);
+
+    // Next four inputs, steered by weight bits 4..7
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0x40404040) ^ 0x40404040;
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x80402010);
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8);
+
+    return weightedSum;
+}
+// quadquadMac, 1W/7A: sixteen MACs with 1-bit weights; __SEL keeps +input where the tested weight bit is 0 and -input where it is 1.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
 {
-    return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val);
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 4);   // four weight bytes are loaded; only bits 0..3 of each are tested below
+    // First four inputs: byte lane k is steered by bit 0 of weight byte k
+    memcpy((void*) &in, inputs, sizeof(in));
+    neg_in = __SSUB8(0, in);   // NOTE(review): no sign-extension step here, unlike the 1W/7A octoMac - confirm
+    __USUB8(0, wt & 0x01010101);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Next four inputs: bit 1 of each weight byte
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x02020202);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 8..11: bit 2 of each weight byte
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x04040404);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 12..15: bit 3 of each weight byte
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08080808);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // NOTE(review): input i is steered by weight bit 8*(i%4) + i/4 here, whereas the 1W/7A octoMac uses bit i - confirm the packing
+    return weightedSum;
 }
 
-template<int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         typename Input_T,
-         typename Weight_T,
-         typename Sum_T>
-inline static
-Sum_T dualMac(const Input_T* __restrict inputs, 
-              const Weight_T* __restrict weights, 
-              Sum_T weightedSum) 
+template<typename Input_T, typename Weight_T, typename Sum_T,   // octoquadMac, 1W/7A: 32 MACs with 1-bit weights
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
 {
-    weightedSum += inputs[0] * weights[0]
-        + inputs[INPUTS_INC] * weights[WEIGHTS_INC];
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 4);   // four bytes hold the 32 one-bit weights
+    // First four inputs: byte lane k is steered by bit 0 of weight byte k; __SEL keeps +input where that bit is 0, -input where it is 1
+    memcpy((void*) &in, inputs, sizeof(in));
+    neg_in = __SSUB8(0, in);   // NOTE(review): no sign-extension step here, unlike the 1W/7A octoMac - confirm
+    __USUB8(0, wt & 0x01010101);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 4..7: bit 1 of each weight byte
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x02020202);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 8..11: bit 2 of each weight byte
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x04040404);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 12..15: bit 3 of each weight byte
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08080808);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 16..19: bit 4 of each weight byte
+    memcpy((void*) &in, inputs + 16, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x10101010);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 20..23: bit 5 of each weight byte
+    memcpy((void*) &in, inputs + 20, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x20202020);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 24..27: bit 6 of each weight byte
+    memcpy((void*) &in, inputs + 24, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x40404040);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
+    // Inputs 28..31: bit 7 of each weight byte. NOTE(review): input i uses weight bit 8*(i%4) + i/4, not bit i as in the 1W/7A octoMac - confirm
+    memcpy((void*) &in, inputs + 28, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x80808080);  
+    mac_result = __SEL(in, neg_in);
+    weightedSum = __SXTAB16(weightedSum, mac_result);
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
 
     return weightedSum;
 }
 
-template<int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         typename Input_T,
-         typename Weight_T,
-         typename Sum_T,
-         typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr>
-inline static
-Sum_T quadMac(const Input_T* __restrict inputs, 
-              const Weight_T* __restrict weights, 
-              Sum_T weightedSum) 
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 5A ------------------------------------
+// ----------------------------------------------------------------------------
+// octoMac, 1W/5A: eight MACs with 1-bit weights on 5-bit inputs stored one per
+// byte. The two groups of four signed bytes are summed lane by lane with
+// __QADD8, then widened and added to the 16-bit lanes of weightedSum.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T weightedSum)
 {
-    weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
-        + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
-        + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
-        + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
+    uint32_t sum = 0;
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 1);
+    wt |= wt << 8;
+    wt |= wt << 16;
+
+    // First four inputs (weight bits 0..3); bit 4 of every byte is sign-extended for signed inputs
+    memcpy((void*) &in, inputs, sizeof(in));
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0x70707070) ^ 0x70707070;
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08040201);
+    sum = __SEL(in, neg_in);
+    // Next four inputs (weight bits 4..7)
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0x70707070) ^ 0x70707070;
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x80402010);
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+    // Fold the four byte sums into weightedSum. Without this step the function
+    // returns weightedSum unchanged and the eight products are lost.
+    weightedSum = __SXTAB16(weightedSum, sum);
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
 
     return weightedSum;
 }
 
-template<int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         typename Input_T,
-         typename Weight_T,
-         typename Sum_T,
-         typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr>
-inline static
-Sum_T quadMac(const Input_T* __restrict inputs, 
-              const Weight_T* __restrict weights, 
-              Sum_T weightedSum) 
+template<typename Input_T, typename Weight_T, typename Sum_T,   // quadquadMac, 1W/5A: sixteen MACs with 1-bit weights
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
 {
-    if(INPUTS_INC != 1 || WEIGHTS_INC != 1) {
-        weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
-            + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
-            + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
-            + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
+    uint32_t sum = 0;
+    uint32_t mac_result = 0;
+    uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 2);   // NOTE(review): only bytes 0-1 of wt are loaded, yet the masks below also test bytes 2-3, which stay 0 - confirm
 
-        return weightedSum;
-    }
+    memcpy((void*) &in, inputs, sizeof(in));   // first four inputs; byte lane k is steered by bit 0 of weight byte k
+    neg_in = __SSUB8(0, in);                   // NOTE(review): no sign-extension step here, unlike the 1W/5A octoMac - confirm
+    __USUB8(0, wt & 0x01010101);  
+    sum = __SEL(in, neg_in);                   // +input where the tested weight bit is 0, -input where it is 1
 
-    // Inputs loading & preparation
+    memcpy((void*) &in, inputs + 4, sizeof(in));   // next four inputs; bit 1 of each weight byte
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x02020202);   
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);                // saturating signed add of the four byte lanes
+    // Inputs 8..11: bit 2 of each weight byte
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x04040404);   
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+    // Inputs 12..15: bit 3 of each weight byte
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08080808);   
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+    // Widen the four signed byte sums and add them to the 16-bit lanes of weightedSum (bytes 0/2, then bytes 1/3)
+    weightedSum = __SXTAB16(weightedSum, sum);
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
+    // NOTE(review): with wt bytes 2-3 always 0, byte lanes 2 and 3 always take +input - looks unintended, confirm the weight packing
+    return weightedSum;
+}
+
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/5A SWAR kernel: eight 4-byte input loads, every byte added or subtracted depending on one weight bit
+Sum_T octoquadMac (const Input_T* __restrict inputs,    // 4-byte loads at inputs, +4, +8, ... +28
+                   const Weight_T* __restrict weights,  // 4 bytes of packed 1-bit weights are read
+                   Sum_T weightedSum)                   // running sum; the updated value is returned
+{
+    uint32_t sum = 0;          // four 8-bit partial sums, one per byte lane
+    uint32_t mac_result = 0;
     uint32_t in;
+    uint32_t neg_in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 4);
+
     memcpy((void*) &in, inputs, sizeof(in));
-    
-    uint32_t in1 = XTB16<Input_T>(in);
-    uint32_t in2 = XTB16<Input_T>(in >> 8);
-    
-    // Weights loading & preparation
-    uint32_t wt;
-    memcpy((void*) &wt, weights, sizeof(wt));
-    
-    uint32_t wt1 = XTB16<Weight_T>(wt);
-    uint32_t wt2 = XTB16<Weight_T>(wt >> 8);
+    neg_in = __SSUB8(0, in);        // per-byte negation; __SSUB8 also updates GE, so it stays ahead of __USUB8
+    __USUB8(0, wt & 0x01010101);    // result unused: only the GE flags matter (set per lane when the masked weight bit is 0)
+    sum = __SEL(in, neg_in);        // lane = in where GE is set, neg_in otherwise
+
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x02020202);    // load k is steered by bit k of every weight byte; here k = 1
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result); // saturating signed add per byte lane
+
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x04040404);    // k = 2
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x08080808);    // k = 3
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+
+    memcpy((void*) &in, inputs + 16, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x10101010);    // k = 4
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+
+    memcpy((void*) &in, inputs + 20, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x20202020);    // k = 5
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+
+    memcpy((void*) &in, inputs + 24, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x40404040);    // k = 6
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result);
+
+    memcpy((void*) &in, inputs + 28, sizeof(in));
+    neg_in = __SSUB8(0, in);
+    __USUB8(0, wt & 0x80808080);    // k = 7
+    mac_result = __SEL(in, neg_in);
+    sum = __QADD8(sum, mac_result); // NOTE(review): eight saturating 8-bit adds per lane - confirm the input range cannot reach the int8 limits
+
+    weightedSum = __SXTAB16(weightedSum, sum);          // adds sign-extended bytes 0 and 2 of sum to the 16-bit halves of weightedSum
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);  // same for bytes 1 and 3 (sum rotated right by 8)
 
-    // Computation
-    if(std::is_same<Sum_T, int32_t>::value) {
-        weightedSum = __SMLAD(in1, wt1, weightedSum);
-        weightedSum = __SMLAD(in2, wt2, weightedSum);
-    }
-    else {
-        weightedSum = __SMLALD(in1, wt1, weightedSum);
-        weightedSum = __SMLALD(in2, wt2, weightedSum);
-        
-    }
-    
     return weightedSum;
 }
 
 
+template<int NB_ITERATIONS,  // compile-time number of MACs still to perform
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8 
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1-bit weights, 5..8-digit inputs, 8 <= NB_ITERATIONS < 16: one octoMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = octoMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS-8>(inputs + 8, weights + 1, weightedSum);  // remainder (0..7 MACs) starts 8 inputs / 1 weight element further on
+}
+
+template<int NB_ITERATIONS,  // compile-time number of MACs still to perform
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8 
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1-bit weights, 5..8-digit inputs, 16 <= NB_ITERATIONS < 32: one quadquadMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = quadquadMac(inputs, weights, weightedSum);  // NOTE(review): the quadquadMac overloads visible in this file require Input_T digits == 5 or == 4 - confirm 6..8 are covered
+    macsOnRange<NB_ITERATIONS-16>(inputs + 16, weights + 2, weightedSum);  // remainder starts 16 inputs / 2 weight elements further on
+}
+
+template<int NB_ITERATIONS,  // compile-time number of MACs still to perform
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8 
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS >= 32)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1-bit weights, 5..8-digit inputs, NB_ITERATIONS >= 32: one octoquadMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = octoquadMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS-32>(inputs + 32, weights + 4, weightedSum);  // remainder starts 32 inputs / 4 weight elements further on
+}
+
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 4A ------------------------------------
+// ----------------------------------------------------------------------------
 
-// **************************************************************************
-// * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS *
-// **************************************************************************
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, single MAC: adds or subtracts inputs[0].fields.op1 depending on weights[0].fields.op7
+Sum_T monoMac (const Input_T* __restrict inputs,    // only inputs[0].fields.op1 is read
+               const Weight_T* __restrict weights,  // only weights[0].fields.op7 is read
+               Sum_T weightedSum)                   // running sum; the updated value is returned
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    return weightedSum;
+}
 
 template<int NB_ITERATIONS,
-         int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         class Input_T, 
-         class Weight_T,
-         class Sum_T,
-         typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr>
-inline static 
-void macsOnRange(const Input_T* __restrict /*inputs*/, 
-                 const Weight_T* __restrict /*weights*/, 
-                 Sum_T& __restrict /*weightedSum*/) 
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 2)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 2 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // inputs[0].fields.op1 then .op0 are read
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 then .op6 are read
+                  Sum_T& weightedSum)                  // accumulator, updated in place
 {
-    // Nothing to do
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
 }
 
 template<int NB_ITERATIONS,
-         int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         class Input_T, 
-         class Weight_T,
-         class Sum_T,
-         typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr>
-inline static 
-void macsOnRange(const Input_T* __restrict inputs, 
-                 const Weight_T* __restrict weights, 
-                 Sum_T& __restrict weightedSum) 
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 3)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 3 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // fields op1, op0 of inputs[0], then op1 of inputs[1]
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 down to .op5
+                  Sum_T& weightedSum)                  // accumulator, updated in place
 {
-    weightedSum += (*weights) * (*inputs);
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
 }
 
 template<int NB_ITERATIONS,
-         int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         class Input_T, 
-         class Weight_T,
-         class Sum_T,
-         typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr>
-inline static 
-void macsOnRange(const Input_T* __restrict inputs, 
-                 const Weight_T* __restrict weights, 
-                 Sum_T& __restrict weightedSum) 
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 4 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // fields op1, op0 of inputs[0] and inputs[1]
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 down to .op4
+                  Sum_T& weightedSum)                  // accumulator, updated in place
 {
-    weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC, 
-                                                            weights + 2*WEIGHTS_INC, 
-                                                            weightedSum);
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 5 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // fields op1, op0 of inputs[0..1], then op1 of inputs[2]
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 down to .op3
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 6)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 6 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // fields op1, op0 of inputs[0..2]
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 down to .op2
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
+    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, exactly 7 MACs, fully unrolled scalar form
+void macsOnRange (const Input_T* __restrict inputs,    // fields op1, op0 of inputs[0..2], then op1 of inputs[3]
+                  const Weight_T* __restrict weights,  // weights[0].fields.op7 down to .op1
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);  // non-zero weight field => subtract, zero => add
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
+    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0);
+    weightedSum += (weights[0].fields.op1) ? (Sum_T)(-(inputs[3].fields.op1)) : (Sum_T)(inputs[3].fields.op1);
+}
+
+// 1W/4A: 8 MACs from 1 weight byte and 4 input bytes (two 4-bit activations each).
+// A set weight bit subtracts its activation, a clear bit adds it.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T weightedSum)
+{
+    uint32_t in;
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 1);
+    wt |= wt << 8;      // replicate the weight byte into all four byte lanes
+    wt |= wt << 16;
+    memcpy((void*) &in, inputs, sizeof(in));
+
+    // Low nibbles: byte lane j is steered by weight bit 2*j.
+    const uint32_t lo = in & 0x0F0F0F0F;
+    uint32_t neg = __SSUB8(0, lo);      // keep ahead of __USUB8: SSUB8 also writes GE
+    __USUB8(0, wt & 0x40100401);        // GE[j] set iff the tested weight bit is 0
+    uint32_t sum = __SEL(lo, neg);      // +nibble where GE is set, -nibble otherwise
+    // High nibbles: shift first, then mask; lane j is steered by weight bit 2*j+1.
+    const uint32_t hi = (in >> 4) & 0x0F0F0F0F;
+    neg = __SSUB8(0, hi);
+    __USUB8(0, wt & 0x80200802);
+    sum = __QADD8(sum, __SEL(hi, neg));
+    weightedSum = __SXTAB16(weightedSum, sum);          // fold byte lanes 0 and 2
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);  // fold byte lanes 1 and 3
+    return weightedSum;
+}
+
+// 1W/4A: 16 MACs over 8 input bytes (two 4-bit activations per byte).
+// A set weight bit subtracts its activation, a clear bit adds it.
+// NOTE(review): 4 weight bytes are loaded and only bits 0..3 of each are tested,
+// while the 16..31 macsOnRange overload advances weights by 2 - confirm the layout.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
+{
+    uint32_t sum = 0;       // four signed 8-bit partial sums, one per byte lane
+    uint32_t in;
+    uint32_t val;           // current nibble of every byte lane, zero-extended
+    uint32_t neg_val;       // per-lane negation of val
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 4);
+
+    memcpy((void*) &in, inputs, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);      // keep ahead of __USUB8: SSUB8 also writes GE
+    __USUB8(0, wt & 0x01010101);    // GE[lane] set iff the tested weight bit is 0
+    sum = __QADD8(sum, __SEL(val, neg_val));    // select from the masked nibble, not the packed word
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x02020202);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x04040404);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x08080808);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    // Fold the four signed byte lanes into the two 16-bit halves of weightedSum.
+    weightedSum = __SXTAB16(weightedSum, sum);
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
+
+    return weightedSum;
+}
+
+// 1W/4A: 32 MACs over 16 input bytes (two 4-bit activations per byte) and
+// 4 weight bytes. Step k (0..7) tests bit k of every weight byte; a set bit
+// subtracts the activation of that byte lane, a clear bit adds it.
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoquadMac (const Input_T* __restrict inputs,
+                   const Weight_T* __restrict weights,
+                   Sum_T weightedSum)
+{
+    uint32_t sum = 0;       // four signed 8-bit partial sums, one per byte lane
+    uint32_t in;
+    uint32_t val;           // current nibble of every byte lane, zero-extended
+    uint32_t neg_val;       // per-lane negation of val
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 4);
+
+    memcpy((void*) &in, inputs, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);      // keep ahead of __USUB8: SSUB8 also writes GE
+    __USUB8(0, wt & 0x01010101);    // GE[lane] set iff the tested weight bit is 0
+    sum = __QADD8(sum, __SEL(val, neg_val));    // select from the masked nibble, not the packed word
+
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x02020202);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    memcpy((void*) &in, inputs + 4, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x04040404);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x08080808);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    memcpy((void*) &in, inputs + 8, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x10101010);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x20202020);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    memcpy((void*) &in, inputs + 12, sizeof(in));
+    val = in & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x40404040);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    val = (in >> 4) & 0x0F0F0F0F;
+    neg_val = __SSUB8(0, val);
+    __USUB8(0, wt & 0x80808080);
+    sum = __QADD8(sum, __SEL(val, neg_val));
+
+    // Fold the four signed byte lanes into the two 16-bit halves of weightedSum.
+    weightedSum = __SXTAB16(weightedSum, sum);
+    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
+
+    return weightedSum;
+}
+
+
+template<int NB_ITERATIONS,  // compile-time number of MACs still to perform
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, 8 <= NB_ITERATIONS < 16: one octoMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = octoMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS-8>(inputs + 4, weights + 1, weightedSum);  // remainder starts 4 input elements / 1 weight element further on
+}
+
+template<int NB_ITERATIONS,  // compile-time number of MACs still to perform
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, 16 <= NB_ITERATIONS < 32: one quadquadMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = quadquadMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS-16>(inputs + 8, weights + 2, weightedSum);  // remainder starts 8 input elements / 2 weight elements further on
 }
 
-/**
- * @brief   MACs Processing
- * @details Performs NB_ITERATIONS MACs operations, storing results into the
- *          weightedSum variable. 
- * 
- * @tparam  NB_ITERATIONS   Number of MACs to perform
- * @tparam  INPUTS_INC      Input Stride
- * @tparam  WEIGHTS_INC     Weights Stride
- * @tparam  Input_T         Input Type
- * 
- * @param   inputs          Pointer to inputs vector
- * @param   weights         Pointer to weights vector
- * @param   weightedSum     Pointer to weightedSum
-*/
 template<int NB_ITERATIONS,
-         int INPUTS_INC = 1,
-         int WEIGHTS_INC = 1,
-         class Input_T, 
-         class Weight_T,
-         class Sum_T,
-         typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr>
-inline static 
-void macsOnRange(const Input_T* __restrict inputs, 
-                 const Weight_T* __restrict weights, 
-                 Sum_T& __restrict weightedSum) 
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 4
+         && NB_ITERATIONS >= 32)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/4A, NB_ITERATIONS >= 32: one octoquadMac step, then recurse
+void macsOnRange (const Input_T* __restrict inputs,    // first input element of the range
+                  const Weight_T* __restrict weights,  // first weight element of the range
+                  Sum_T& weightedSum)                  // accumulator, updated in place
+{
+    weightedSum = octoquadMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS-32>(inputs + 16, weights + 4, weightedSum);  // remainder starts 16 input elements / 4 weight elements further on
+}
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 1W-7A --------------------
+// ----------------------------------------------------------------------------
+
+template<typename Input_T,
+         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/7A: sign-selects 4 input bytes with a ready-made weight mask and adds them into two 16-bit lanes
+uint32_t quadMacInter(const Input_T* __restrict inputs,  // 4 bytes are loaded from here
+                      const uint32_t weight,             // per-byte selector: a zero byte keeps the input, a non-zero byte negates it
+                      uint32_t weightedSum)              // packed pair of 16-bit sums; the updated value is returned
+{
+    uint32_t in;
+    memcpy((void*) &in, inputs, sizeof(in));
+
+    // Sign extend 
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;  // NOTE(review): plain 32-bit add, so a carry can cross byte lanes - confirm the input encoding makes this safe
+
+    uint32_t neg_in = __SSUB8(0, in);  // per-byte negation; __SSUB8 also updates GE, so it stays ahead of __USUB8
+
+    __USUB8(0, weight);  // result unused: only the GE flags matter (set per lane when the weight byte is 0)
+    uint32_t mac_result = __SEL(in, neg_in);  // lane = in where GE is set, neg_in otherwise
+      
+    uint32_t evenA1 = __SXTB16(mac_result);          // bytes 0 and 2, sign-extended to 16 bits
+    uint32_t oddA1  = __SXTB16_RORn(mac_result, 8);  // bytes 1 and 3, sign-extended to 16 bits
+
+    weightedSum = __SADD16(evenA1, weightedSum);  // halfword-wise signed add
+    weightedSum = __SADD16(oddA1, weightedSum);  
+
+    return weightedSum;
+}
+
+template<typename Input_T,
+         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/7A: same selection as quadMacInter, accumulated with __SXTAB16 instead of __SXTB16 + __SADD16
+uint32_t quadMacInterV2(const Input_T* __restrict inputs,  // 4 bytes are loaded from here
+                      const uint32_t weight,             // per-byte selector: a zero byte keeps the input, a non-zero byte negates it
+                      uint32_t weightedSum)              // packed pair of 16-bit sums; the updated value is returned
+{
+    uint32_t in;
+    memcpy((void*) &in, inputs, sizeof(in));
+
+    // Sign extend 
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;  // NOTE(review): plain 32-bit add, so a carry can cross byte lanes - confirm the input encoding makes this safe
+
+    uint32_t neg_in = __SSUB8(0, in);  // per-byte negation; __SSUB8 also updates GE, so it stays ahead of __USUB8
+
+    __USUB8(0, weight);  // result unused: only the GE flags matter (set per lane when the weight byte is 0)
+    uint32_t mac_result = __SEL(in, neg_in);  // lane = in where GE is set, neg_in otherwise
+      
+    weightedSum = __SXTAB16(weightedSum, mac_result);          // adds sign-extended bytes 0 and 2 to the 16-bit halves
+    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8);  // adds sign-extended bytes 1 and 3
+
+    return weightedSum;
+}
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 1W-5A --------------------
+// ----------------------------------------------------------------------------
+
+template<typename Input_T,
+         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 1W/5A: sign-selects 4 input bytes with a ready-made weight mask and adds them into four 8-bit lanes
+uint32_t quadMacInter(const Input_T* __restrict inputs,  // 4 bytes are loaded from here
+                      const uint32_t weight,             // per-byte selector: a zero byte keeps the input, a non-zero byte negates it
+                      uint32_t weightedSum)              // four packed 8-bit sums; the updated value is returned
+{
+    uint32_t in;
+    memcpy((void*) &in, inputs, sizeof(in));
+
+    // Sign extend 
+    if (!std::is_unsigned<Input_T>::value)
+        in = (in + 0x70707070) ^ 0x70707070;  // NOTE(review): plain 32-bit add, so a carry can cross byte lanes - confirm the input encoding makes this safe
+
+    uint32_t neg_in = __SSUB8(0, in);  // per-byte negation; __SSUB8 also updates GE, so it stays ahead of __USUB8
+
+    __USUB8(0, weight);  // result unused: only the GE flags matter (set per lane when the weight byte is 0)
+    uint32_t mac_result = __SEL(in, neg_in);  // lane = in where GE is set, neg_in otherwise
+      
+    weightedSum = __QADD8(weightedSum, mac_result);  // saturating signed add per byte lane
+
+    return weightedSum;
+}
+
+
+// ----------------------------------------------------------------------------
+// ------------------- MAC computing general functions ------------------------
+// ----------------------------------------------------------------------------
+
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // recursion terminator: zero MACs left, every argument is ignored
+void macsOnRange(const Input_T* __restrict /*inputs*/,
+                 const Weight_T* __restrict /*weights*/,
+                 Sum_T& /*weightedSum*/)
+{
+    // Nothing should happen
+}
+
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // exactly one MAC left: delegate to the monoMac overload for these types
+void macsOnRange(const Input_T* __restrict inputs,
+                 const Weight_T* __restrict weights,
+                 Sum_T& weightedSum)  // accumulator, updated in place
+{
+    weightedSum = monoMac(inputs, weights, weightedSum);
+}
+
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 2 or 3 MACs left, weights wider than 1 bit: one dualMac step, then recurse
+void macsOnRange(const Input_T* __restrict inputs,
+                 const Weight_T* __restrict weights,
+                 Sum_T& weightedSum)  // accumulator, updated in place
+{
+    constexpr unsigned int idxI   // input elements consumed by the 2-MAC step
+        = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1;
+    constexpr unsigned int idxW   // weight elements consumed by the 2-MAC step
+        = (std::numeric_limits<Weight_T>::digits > 4) ? 2 : 1;
+
+    weightedSum = dualMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum);  // remainder is 0 or 1 MAC
+}
+
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<NB_ITERATIONS >= 4 
+         && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // at least 4 MACs left, weights wider than 4 digits: one quadMac step, then recurse
+void macsOnRange(const Input_T* __restrict inputs,
+                 const Weight_T* __restrict weights,
+                 Sum_T& weightedSum)  // accumulator, updated in place
+{
+    constexpr unsigned int idxI   // input elements consumed by the 4-MAC step: 4 (> 4 digits), 2 (4 digits) or 1 (narrower)
+        = (std::numeric_limits<Input_T>::digits > 4) 
+          ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
+
+    constexpr unsigned int idxW = 4;  // weight elements consumed by the 4-MAC step
+
+    weightedSum = quadMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
+}
+
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) 
+         && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // 4..7 MACs left, 4-digit weights: one quadMac step, then recurse
+void macsOnRange(const Input_T* __restrict inputs,
+                 const Weight_T* __restrict weights,
+                 Sum_T& weightedSum)  // accumulator, updated in place
 {
-    weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC, 
-                                                            weights + 4*WEIGHTS_INC, 
-                                                            weightedSum);
+    constexpr unsigned int idxI   // input elements consumed by the 4-MAC step: 4 (> 4 digits), 2 (4 digits) or 1 (narrower)
+        = (std::numeric_limits<Input_T>::digits > 4) 
+          ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
+
+    constexpr unsigned int idxW = 2;  // weight elements consumed by the 4-MAC step
+
+    weightedSum = quadMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);  // remainder is 0..3 MACs
 }
 
+template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<NB_ITERATIONS >= 8 
+         && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline  // at least 8 MACs left, 4-digit weights: one octoMac step, then recurse
+void macsOnRange(const Input_T* __restrict inputs,
+                 const Weight_T* __restrict weights,
+                 Sum_T& weightedSum)  // accumulator, updated in place
+{
+    constexpr unsigned int idxI   // input elements consumed by the 8-MAC step: 8 (> 4 digits), 4 (4 digits), 2 (2 digits) or 1 (otherwise)
+        = (std::numeric_limits<Input_T>::digits > 4) 
+          ? 8 : (std::numeric_limits<Input_T>::digits == 4) 
+            ? 4 : (std::numeric_limits<Input_T>::digits == 2)
+              ? 2 : 1;
+
+    constexpr unsigned int idxW = 4;  // weight elements consumed by the 8-MAC step
+
+    weightedSum = octoMac(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum);
+}
 
-}   // N2D2_Export
 
-#endif  // __N2D2_EXPORT_CPP_MACS_HPP__
+#endif // __N2D2_MAC_FUNCTIONS_HPP__
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp
index 68d6f21..94615a5 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp
@@ -27,15 +27,15 @@
 
 namespace N2D2_Export {
 
-static int64_t toInt64(uint32_t lo, uint32_t hi) {
-    return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo);
-}
-
-static int64_t smlal(int32_t lhs, int32_t rhs, 
-                     uint32_t accumLo, uint32_t accumHi) 
-{
-    return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi);
-}
+// static int64_t toInt64(uint32_t lo, uint32_t hi) {
+//     return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo);
+// }
+
+// static int64_t smlal(int32_t lhs, int32_t rhs, 
+//                      uint32_t accumLo, uint32_t accumHi) 
+// {
+//     return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi);
+// }
 
 // ---------------------------------------------------
 // ------------------- No Scaling --------------------
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp
new file mode 100644
index 0000000..62743db
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp
@@ -0,0 +1,312 @@
+/**
+ ******************************************************************************
+ * @file     subkernels_functions.hpp
+ * @brief    Header file for the network subkernels
+ * 
+ ******************************************************************************
+ * @attention
+ * 
+ * (C) Copyright 2021 CEA LIST. All Rights Reserved.
+ *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
+ * 
+ * This file is not part of the open source version of N2D2 and is NOT under
+ * the CeCILL-C license. This code is the property of the CEA. It can not be
+ * copied or disseminated without its authorization.
+ * 
+ ******************************************************************************
+ */
+
+#ifndef __SUBKERNELS_FUNCTIONS_H__
+#define __SUBKERNELS_FUNCTIONS_H__
+
+#include <cstring>
+#include <cmsis_compiler.h>
+#include "typedefs.hpp"
+#include "assert.h"
+
+
+// ----------------------------------------------------------------------------
+// -------------------------- Compression functions ---------------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief   Compact data during a loop with an accumulator
+ * @details Packs one sub-byte quantized value into the accumulator held
+ *          by infoPack. The value is masked to the bit width of Output_T
+ *          and OR-ed into the low bits of the accumulator. Once all the
+ *          slots of a byte (8 / bit width, rounded up) are taken, the
+ *          accumulator is stored in outputs[outputOffset], the offset is
+ *          advanced and the accumulator is reset. Otherwise the accumulator
+ *          is shifted left by one slot to make room for the next value.
+ * 
+ * @param[in]     value        Value to be stored in the accumulator
+ * @param[in,out] outputs      Pointer to compressed output vector
+ * @param[in,out] outputOffset Reference to the current output index
+ * @param[in,out] infoPack     Object containing the accumulator
+ * @returns                    None
+ * 
+ */
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_during_loop (Output_T value,
+                               Output_T* __restrict outputs,
+                               int& outputOffset,
+                               PackSupport& infoPack)
+{
+    if (std::numeric_limits<Output_T>::digits < 8) {
+        constexpr uint8_t mask = (1U << std::numeric_limits<Output_T>::digits) - 1;
+        // Integer ceiling of 8/digits (ceil() is not usable in a constant expression before C++23)
+        constexpr uint8_t nbSlot = (8 + std::numeric_limits<Output_T>::digits - 1) / std::numeric_limits<Output_T>::digits;
+        infoPack.accumulator |= value.value & mask;
+        infoPack.cptAccumulator += 1;
+        // Flush the accumulator once it is full, otherwise make room for the next value
+        if (infoPack.cptAccumulator == nbSlot) {
+            outputs[outputOffset] = (Output_T) infoPack.accumulator;
+            ++outputOffset;
+            infoPack.cptAccumulator = 0;
+            infoPack.accumulator = 0;
+        }
+        else {
+            infoPack.accumulator <<= std::numeric_limits<Output_T>::digits;
+        }
+    } else {  // not reachable: this overload is only enabled when digits < 8
+        outputs[outputOffset] = (Output_T) value;
+        ++outputOffset;
+    }
+}
+// Overload for outputs of 8 bits or more: no packing, value is written as is to outputs[outputOffset].
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_during_loop (const Output_T value,
+                               Output_T* __restrict outputs,
+                               int& outputOffset,
+                               PackSupport& infoPack)  // unused in this overload
+{
+    outputs[outputOffset] = value;  // NOTE(review): outputOffset is not advanced here, unlike the sub-byte overload - confirm this is what callers expect
+}
+
+/**
+ * @brief   Compact data after a loop with an accumulator
+ * @details It may happen that the accumulator is not completely filled
+ *          after calling "compact_data_during_loop", so the quantized
+ *          values it holds have not been saved in the outputs yet.
+ *          This function pads the accumulator with zero slots until it
+ *          is full, stores it in outputs[outputOffset], advances the
+ *          offset and resets the accumulator. It does nothing when the
+ *          accumulator is empty. It should always be called at the end of
+ *          a loop where "compact_data_during_loop" is called
+ * 
+ * @param[in,out] outputs      Pointer to compressed output vector
+ * @param[in,out] outputOffset Reference to the current output index
+ * @param[in,out] infoPack     Object containing the accumulator
+ * @returns                    None
+ * 
+ */
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_end_loop (Output_T* __restrict outputs,
+                            int& outputOffset,
+                            PackSupport& infoPack)
+{
+    if (std::numeric_limits<Output_T>::digits < 8) {
+        // (condition always true: this overload is only enabled when digits < 8)
+        // if data still accumulated but not stored
+        if (infoPack.cptAccumulator != 0) {
+            constexpr unsigned int nbSlot = (8 + std::numeric_limits<Output_T>::digits - 1) / std::numeric_limits<Output_T>::digits;
+            // compact_data_during_loop already shifted the accumulator once after
+            // the last value: count that slot first, then pad the remaining ones
+            infoPack.cptAccumulator += 1;
+            while (infoPack.cptAccumulator < nbSlot) {
+                infoPack.accumulator <<= std::numeric_limits<Output_T>::digits;
+                infoPack.cptAccumulator += 1;
+            }
+            outputs[outputOffset] = infoPack.accumulator;
+            ++outputOffset;
+            infoPack.cptAccumulator = 0;
+            infoPack.accumulator = 0;
+        }
+    }
+}
+// Overload for outputs of 8 bits or more: the matching compact_data_during_loop overload stores directly, so there is nothing to flush.
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_end_loop (Output_T* __restrict outputs,
+                            int& outputOffset,
+                            PackSupport& infoPack)
+{
+    //  Intentionally empty: all three parameters are unused
+}
+
+
+
+// ----------------------------------------------------------------------------
+// ------------------------- Pooling subfunctions -----------------------------
+// ------------------------------ Max Pooling ---------------------------------
+// ----------------------------------------------------------------------------
+// Maps a bit width to a data count: 4 for 8 bits, 2 for 4 and 16 bits, 1 otherwise. NOTE(review): presumably the nb_data given to the pooling helpers below - confirm against callers
+__attribute__((always_inline)) static inline
+int get_pool_nbData (const int nbBits)
+{
+    int nb_data = 1;  // value returned for any bit width not listed below
+    switch (nbBits)
+    {
+    case 8: nb_data = 4;
+            break;
+    case 4: nb_data = 2;
+            break;
+    case 16: nb_data = 2;
+            break;
+    default:
+        break;
+    }
+    return nb_data;
+}
+// 4-bit overload: packs the low nibbles of maxVal's bytes pairwise and stores nb_data packed bytes from outputs[outputOffset], advancing outputOffset.
+template<typename Output_T,
+    typename std::enable_if<std::numeric_limits<Output_T>::digits == 4>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void storeMaxPooling (Output_T* __restrict outputs,
+                      int& outputOffset,
+                      const uint32_t maxVal,
+                      const int nb_data)
+{
+    uint32_t data_val = maxVal;
+    assert(nb_data == 2 || nb_data == 1);
+
+    // Merge the low nibbles of each pair of bytes into the low byte of each 16-bit half
+    // Ex: 0x0A050403 -> 0x00A50043
+    data_val = ((data_val & 0x0F000F00) >> 4) | (data_val & 0x000F000F);
+
+    // Store the packed bytes: index 0 takes bits [7:0], index 1 takes bits [23:16]
+    for (int index = 0; index < nb_data; ++index) {
+        outputs[outputOffset] = (uint8_t) ((data_val >> 16*index) & 0xFF);
+        outputOffset += 1;
+    }
+}
+// 8-bit overload: copies the first nb_data bytes of maxVal (in memory order) to the address given by outputs.
+template<typename Output_T,
+    typename std::enable_if<std::numeric_limits<Output_T>::digits == 8>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void storeMaxPooling (Output_T* __restrict outputs,
+                      int& outputOffset,  // NOTE(review): neither used nor advanced here, unlike the 4-bit overload - confirm callers pass outputs already offset
+                      const uint32_t maxVal,
+                      const int nb_data)
+{
+    memcpy(outputs, &maxVal, nb_data*sizeof(uint8_t));
+}
+// Lane-wise unsigned 16-bit max: each 16-bit lane of maxVal becomes max(lane, matching input); lanes beyond nb_data are compared with 0.
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    // Load nb_data 16-bit inputs into one word; lanes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint16_t));
+    // max(a, b) = b + sat(a - b): the saturating subtract clamps a - b at 0 in each lane
+    maxVal = __UQSUB16(maxVal, in);
+    maxVal = __UQADD16(maxVal, in);
+}
+// Lane-wise signed 16-bit max: each 16-bit lane of maxVal becomes max(lane, matching input); lanes beyond nb_data are compared with 0.
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,  // by reference (as in the unsigned overloads) so the updated maximum reaches the caller
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    // Load nb_data 16-bit inputs into one word; lanes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint16_t));
+    // max(a, b) = b + max(a - b, 0): SSUB16 sets the GE flags per lane, SEL keeps a - b only where it is >= 0
+    maxVal = __SSUB16(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD16(maxVal, in);
+}
+// Lane-wise unsigned 8-bit max: each byte lane of maxVal becomes max(lane, matching input); lanes beyond nb_data are compared with 0.
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data <= 4 && nb_data >= 1);
+    // Load nb_data bytes into one word; lanes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    // max(a, b) = b + sat(a - b): the saturating subtract clamps a - b at 0 in each lane
+    maxVal = __UQSUB8(maxVal, in);
+    maxVal = __UQADD8(maxVal, in);
+}
+// Lane-wise signed 8-bit max: each byte lane of maxVal becomes max(lane, matching input); lanes beyond nb_data are compared with 0.
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,  // by reference (as in the unsigned overloads) so the updated maximum reaches the caller
+                         const int nb_data)
+{
+    assert(nb_data <= 4 && nb_data >= 1);
+    // Load nb_data bytes into one word; lanes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    // max(a, b) = b + max(a - b, 0): SSUB8 sets the GE flags per lane, SEL keeps a - b only where it is >= 0
+    maxVal = __SSUB8(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD8(maxVal, in);
+}
+// Lane-wise unsigned 4-bit max: nb_data packed bytes (two 4-bit values each) are spread to one value per byte lane, then compared with maxVal.
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    // Load nb_data packed bytes into one word; bytes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    // Spread the nibbles so that each one sits in the low half of its own byte, ex: 0x0000ABCD -> 0x0A0B0C0D
+    in = (in | in << 8) & 0xFF00FF;
+    in = (in | in << 4) & 0xF0F0F0F;
+    // max(a, b) = b + sat(a - b): the saturating subtract clamps a - b at 0 in each lane
+    maxVal = __UQSUB8(maxVal, in);
+    maxVal = __UQADD8(maxVal, in);
+}
+// Lane-wise signed 4-bit max: nb_data packed bytes (two 4-bit values each) are spread and sign-extended to one value per byte lane, then compared with maxVal.
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,  // by reference (as in the unsigned overloads) so the updated maximum reaches the caller
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    // Load nb_data packed bytes into one word; bytes not loaded stay 0
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    // Spread the nibbles to one per byte (ex: 0x0000ABCD -> 0x0A0B0C0D), then sign-extend each from 4 to 8 bits
+    in = (in | in << 8) & 0xFF00FF;
+    in = (in | in << 4) & 0xF0F0F0F;
+    in += 0x78787878;  // (x + 0x78) ^ 0x78 maps 0..7 to 0..7 and 8..15 to 0xF8..0xFF, without carry between bytes
+    in ^= 0x78787878;
+    // max(a, b) = b + max(a - b, 0): SSUB8 sets the GE flags per lane, SEL keeps a - b only where it is >= 0
+    maxVal = __SSUB8(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD8(maxVal, in);
+}
+
+
+#endif
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h
new file mode 100644
index 0000000..31223f2
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h
@@ -0,0 +1,356 @@
+/**
+ ******************************************************************************
+ * @file     swar_arm_acle.h
+ * @brief    Complete ARM Non-NEON ACLE intrinsics for Cortex m7 and m4
+ * 
+ ******************************************************************************
+ * @attention
+ * 
+ * (C) Copyright 2021 CEA LIST. All Rights Reserved.
+ *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
+ *                  Philippe DORE (philippe.dore@cea.fr)
+ * 
+ * This file is not part of the open source version of N2D2 and is NOT under
+ * the CeCILL-C license. This code is the property of the CEA. It can not be
+ * copied or disseminated without its authorization.
+ * 
+ ******************************************************************************
+ */
+
+#ifndef _SWAR_ARM_ACLE_H
+#define _SWAR_ARM_ACLE_H
+
+#include <cmsis_compiler.h>
+#include "assert.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief   Rotate right and perform dual extracted 8-bit to 16-bit signed addition
+ * @details This function rotates op2 right by ror bits, extracts the two 8-bit values found at bit positions
+ *          [7:0] and [23:16] of the rotated value, sign-extends each to 16 bits and adds the results to op1
+ * @param[in]  op1  Two 16-bit values in op1[15:0] and op1[31:16]
+ * @param[in]  op2  Value holding the two 8-bit values (at [7:0] and [23:16] once rotated) to be sign-extended
+ * @param[in]  ror  Number of bits to rotate op2. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] 
+ *                  and op2[23:16] have been extracted and sign-extended prior to the addition
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+int32_t __SXTAB16_RORn (const int32_t op1, const int32_t op2, const int8_t ror)
+{
+    int32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right, dual extract 8-bits and sign extend each to 16-bits
+ * @param[in]  op1  Value holding the two 8-bit values (at [7:0] and [23:16] once rotated) to be sign-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         The 8-bit values sign-extended to 16-bit values
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+int32_t __SXTB16_RORn (const int32_t op1, const int8_t ror)
+{
+    int32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right and perform dual extracted 8-bit to 16-bit unsigned addition
+ * @details This function rotates op2 right by ror bits, extracts the two 8-bit values found at bit positions
+ *          [7:0] and [23:16] of the rotated value, zero-extends each to 16 bits and adds the results to op1
+ * @param[in]  op1  Two 16-bit values in op1[15:0] and op1[31:16]
+ * @param[in]  op2  Value holding the two 8-bit values (at [7:0] and [23:16] once rotated) to be zero-extended
+ * @param[in]  ror  Number of bits to rotate op2. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] 
+ *                  and op2[23:16] have been extracted and zero-extended prior to the addition
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTAB16_RORn (const uint32_t op1, const uint32_t op2, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right, dual extract 8-bits and zero extend each to 16-bits
+ * @param[in]  op1  Value holding the two 8-bit values (at [7:0] and [23:16] once rotated) to be zero-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         The 8-bit values zero-extended to 16-bit values
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTB16_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Sign extend Halfword
+ * @details Sign-extends the low 16 bits of op1 to 32 bits with the SXTH instruction
+ * @param[in]  op1  op1[15:0] to be sign-extended
+ * @returns         Register holding the sign-extended 32-bit value
+ * @note    The result is returned as a uint32_t bit pattern; cast it to int32_t to read it as signed
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __SXTH (const uint32_t op1)
+{
+    uint32_t result;
+
+    __ASM volatile ("sxth %0, %1" : "=r" (result) : "r" (op1));
+    return result;
+}
+
+
+/**
+ * @brief   Zero extend Halfword
+ * @details Zero-extends the low 16 bits of op1 to 32 bits with the UXTH instruction
+ * @param[in]  op1  op1[15:0] to be zero-extended
+ * @returns         Register holding the zero-extended 32-bit value
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTH (const uint32_t op1)
+{
+    uint32_t result;
+
+    __ASM volatile ("uxth %0, %1" : "=r" (result) : "r" (op1));  // result = op1[15:0], upper halfword cleared
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right and sign extend halfword
+ * @param[in]  op1  Value to rotate; bits [15:0] of the rotated value are sign-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         Register holding the sign-extended 32-bit value
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __SXTH_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxth %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right and zero extend halfword
+ * @param[in]  op1  Value to rotate; bits [15:0] of the rotated value are zero-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         Register holding the zero-extended 32-bit value
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTH_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxth %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Sign extend Byte
+ * @details Sign-extends the low 8 bits of op1 to 32 bits with the SXTB instruction
+ * @param[in]  op1  op1[7:0] to be sign-extended
+ * @returns         Register holding the sign-extended 32-bit value
+ * @note    The result is returned as a uint32_t bit pattern; cast it to int32_t to read it as signed
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __SXTB (const uint32_t op1)
+{
+    uint32_t result;
+
+    __ASM volatile ("sxtb %0, %1" : "=r" (result) : "r" (op1));
+    return result;
+}
+
+
+/**
+ * @brief   Zero extend Byte
+ * @details Zero-extends the low 8 bits of op1 to 32 bits with the UXTB instruction
+ * @param[in]  op1  op1[7:0] to be zero-extended
+ * @returns         Register holding the zero-extended 32-bit value
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTB (const uint32_t op1)
+{
+    uint32_t result;
+
+    __ASM volatile ("uxtb %0, %1" : "=r" (result) : "r" (op1));  // result = op1[7:0], upper 24 bits cleared
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right and sign extend byte
+ * @param[in]  op1  Value to rotate; bits [7:0] of the rotated value are sign-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         Register holding the sign-extended 32-bit value
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __SXTB_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtb %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Rotate right and zero extend byte
+ * @param[in]  op1  Value to rotate; bits [7:0] of the rotated value are zero-extended
+ * @param[in]  ror  Number of bits to rotate op1. Only 0, 8, 16 and 24 are accepted (checked by assert)
+ * @returns         Register holding the zero-extended 32-bit value
+ * @note    ror is emitted as an immediate ("i" constraint): it must be a compile-time constant once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTB_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtb %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+
+
+/**
+ * @brief   Signed Bit Field Extract
+ * @details Copies adjacent bits from one register into the least significant bits 
+ *          of a second register, and sign extends to 32 bits
+ * @param[in]  op1    Value to be extracted
+ * @param[in]  lsb    Position of the least significant bit of the bit field, from 0 to 31
+ * @param[in]  width  Width of the bit field, from 1 to 32-lsb
+ * @returns           Extracted bitfield and sign extended to 32 bits
+ * @note    lsb and width are emitted as immediates ("i" constraint): they must be compile-time constants once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+int32_t __SBFX (const uint32_t op1, const int8_t lsb, const int8_t width)
+{
+    int32_t result;
+    // SBFX encodes a width of 1..32-lsb: a zero width is invalid, a field reaching bit 31 is valid
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));
+    __ASM volatile ("sbfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) );
+    return result;
+}
+
+
+/**
+ * @brief   Unsigned Bit Field Extract
+ * @details Copies adjacent bits from one register into the least significant bits 
+ *          of a second register, and zero extends to 32 bits
+ * @param[in]  op1    Value to be extracted
+ * @param[in]  lsb    Position of the least significant bit of the bit field, from 0 to 31
+ * @param[in]  width  Width of the bit field, from 1 to 32-lsb
+ * @returns           Extracted bitfield and zero extended to 32 bits
+ * @note    lsb and width are emitted as immediates ("i" constraint): they must be compile-time constants once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UBFX (const uint32_t op1, const int8_t lsb, const int8_t width)
+{
+    uint32_t result;
+    // UBFX encodes a width of 1..32-lsb: a zero width is invalid, a field reaching bit 31 is valid
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));
+    __ASM volatile ("ubfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) );
+    return result;
+}
+
+
+/**
+ * @brief   Bit Field Insert
+ * @details Copies a bitfield into one register from another register
+ *          It replaces width bits in op2 starting at the position lsb, 
+ *          with width bits from op1 starting at bit[0].  
+ *          Other bits in op2 are unchanged
+ * @param[in]      op1    Source value
+ * @param[in]      op2    Destination value (taken by value: the updated word is only available as the return value)
+ * @param[in]      lsb    Position of the least significant bit of the bit field, from 0 to 31
+ * @param[in]      width  Width of the bit field, from 1 to 32-lsb
+ * @returns               The register which contains op2 and the added bitfield
+ * @note    lsb and width are emitted as immediates ("i" constraint): they must be compile-time constants once inlined
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __BFI (uint32_t op1, uint32_t op2, const int8_t lsb, const int8_t width)
+{
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));  // BFI encodes a width of 1..32-lsb
+    __ASM volatile ("bfi %0, %1, %2, %3" : "+r" (op2) : "r" (op1), "i" (lsb), "i" (width), "0" (op2) );
+    return op2;
+}
+
+
+/**
+ * @brief   Signed Divide
+ * @details Performs a signed integer division of the value in op1 
+ *          by the value in op2, using the SDIV instruction.
+ * @param[in]  op1  Register holding the value to be divided (declared uint32_t, divided as signed)
+ * @param[in]  op2  Register holding the divisor (declared uint32_t, used as signed)
+ * @returns         Register holding the signed result op1/op2, as a uint32_t bit pattern
+ * @note    This wrapper does not check for op2 == 0
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __SDIV (const uint32_t op1, const uint32_t op2)
+{
+    uint32_t result;
+
+    __ASM volatile ("sdiv %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+    return result;
+}
+
+
+/**
+ * @brief   Unsigned Divide
+ * @details Performs an unsigned integer division of the value in op1 
+ *          by the value in op2, using the UDIV instruction.
+ * @param[in]  op1  Register holding the value to be divided
+ * @param[in]  op2  Register holding the divisor
+ * @returns         Register holding the unsigned result op1/op2
+ * @note    This wrapper does not check for op2 == 0
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UDIV (const uint32_t op1, const uint32_t op2)
+{
+    uint32_t result;
+
+    __ASM volatile ("udiv %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+    return result;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp
index 6b1228b..9111d1c 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp
@@ -28,7 +28,7 @@
 #include <cstring>
 #include <limits>
 
-#include "typedefs.h"
+#include "kernels/typedefs.hpp"
 
 namespace N2D2_Export {
 
-- 
GitLab