From bb480b57c563d5b5acc55e14a6a23dc9c36a6254 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 16 Jul 2025 17:46:22 +0200
Subject: [PATCH] Operators adaptation

---
 aidge_export_cpp/kernels/convolution.hpp      | 169 +++++++------
 .../kernels/convolution_depthwise.hpp         | 177 +++++++------
 aidge_export_cpp/kernels/fullyconnected.hpp   | 234 +++++++++++-------
 aidge_export_cpp/kernels/pooling.hpp          |  90 ++++++-
 aidge_export_cpp/static/macs.hpp              | 125 ----------
 aidge_export_cpp/static/typedefs.hpp          |  40 +++
 6 files changed, 471 insertions(+), 364 deletions(-)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 1b4f9fd..beaa3a6 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -40,6 +40,11 @@ void convolution_forward(
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
+    constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>();
+    constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>();
+    constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>();
+    constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>();
+
     constexpr size_t OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
     constexpr size_t OUTPUTS_WIDTH_NOPAD
@@ -54,21 +59,15 @@ void convolution_forward(
                     0, KERNEL_HEIGHT);
         const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
+        // oy loop should not be parallelized to allow memory wrapping: memory
+        // lines must be processed in order.
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
         for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (size_t output = 0; output < NB_OUTPUTS; ++output) {
-                // moved to inner loop for collapsing -->
-                const size_t sxMin = (PADDING_X == 0) ? 0
-                    : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0);
-                const size_t sxMax = (PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? KERNEL_WIDTH
-                    : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X,
-                            0, KERNEL_WIDTH);
-                const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
-
+            for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) {
+                Output_T packedOutputVal;
+            
                 const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
                 int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
 
@@ -77,82 +76,104 @@ void convolution_forward(
                                 - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
                 }
 
-                // <--
-                // Check if the biases are defined
-                Bias_T weightedSum = biases ? biases[output] : 0;
-
-                for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
-                    if ((PADDING_Y != 0
-                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && sy >= syMax - syMin)
-                    {
+                for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) {
+                    if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) {
                         break;
                     }
 
-                    const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix)
-                                        + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y));
-                    int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
+                    const size_t output = outputPack * n_pack<Output_T>() + packElt;
+
+                    // moved to inner loop for collapsing -->
+                    const size_t sxMin = (PADDING_X == 0) ? 0
+                        : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0);
+                    const size_t sxMax = (PADDING_X == 0
+                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                                ? KERNEL_WIDTH
+                        : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X,
+                                0, KERNEL_WIDTH);
+                    const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
+
+                    // <--
+                    // Check if the biases are defined
+                    Bias_T weightedSum = biases ? biases[output] : 0;
+
+                    for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                        if ((PADDING_Y != 0
+                                || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
+                            && sy >= syMax - syMin)
+                        {
+                            break;
+                        }
 
-                    // Wrapping cannot occur in the middle of a line, except if
-                    // there is only one line (1D)!
-                    bool wrapInRange = false;
+                        const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix)
+                                            + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y));
+                        int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-                    if (INPUT_MEM_WRAP_SIZE > 0
-                        && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                    {
-                        iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
-                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-                    }
-                    else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
-                        && CHANNELS_HEIGHT == 1 // single line (1D)!
-                        && iOffset + KERNEL_WIDTH * NB_CHANNELS
-                            > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                    {
-                        wrapInRange = true;
-                    }
+                        // Wrapping cannot occur in the middle of a line, except if
+                        // there is only one line (1D)!
+                        bool wrapInRange = false;
 
-                    const size_t wOffset = NB_CHANNELS * (sxMin
-                        + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
-
-                    if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
-                        && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH))
-                    {
-                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
-                            inputs + iOffset,
-                            weights + wOffset,
-                            weightedSum);
-                    }
-                    else {
-                        for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
-                            if ((PADDING_X != 0
-                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && sx >= sxMax - sxMin)
-                            {
-                                break;
-                            }
-
-                            int iOffsetInRange = iOffset
-                                + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+                        if (INPUT_MEM_WRAP_SIZE > 0
+                            && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+                        else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
+                            && CHANNELS_HEIGHT == 1 // single line (1D)!
+                            && iOffset + KERNEL_WIDTH * NB_I_CHANNELS_PACKED
+                                > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            wrapInRange = true;
+                        }
 
-                            if (wrapInRange
-                                && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                            {
-                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
-                                            - INPUT_MEM_CONT_OFFSET
-                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-                            }
+                        const size_t wOffset = NB_W_CHANNELS_PACKED * (sxMin
+                            + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
 
-                            macsOnRange<NB_CHANNELS>(
-                                // same input line so no wrapping can occur
-                                inputs + iOffsetInRange,
-                                weights + wOffset + sx * NB_CHANNELS,
+                        if (!wrapInRange && NB_I_CHANNELS_PACKED == (INPUT_MEM_STRIDE / sizeof(Input_T))
+                            && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                            || sxMax - sxMin == KERNEL_WIDTH))
+                        {
+                            macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
+                                inputs + iOffset,
+                                weights + wOffset,
                                 weightedSum);
                         }
+                        else {
+                            for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                                if ((PADDING_X != 0
+                                        || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
+                                    && sx >= sxMax - sxMin)
+                                {
+                                    break;
+                                }
+
+                                int iOffsetInRange = iOffset
+                                    + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                                if (wrapInRange
+                                    && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                                {
+                                    iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                                - INPUT_MEM_CONT_OFFSET
+                                                - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                                }
+
+                                macsOnRange<NB_CHANNELS>(
+                                    // same input line so no wrapping can occur
+                                    inputs + iOffsetInRange,
+                                    weights + wOffset + sx * NB_W_CHANNELS_PACKED,
+                                    weightedSum);
+                            }
+                        }
                     }
+
+                    const auto outputVal
+                        = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
+                    pack_rev_set(packedOutputVal, packElt, outputVal);
                 }
 
-                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
+                outputs[oOffset + outputPack] = packedOutputVal;
             }
         }
     }
diff --git a/aidge_export_cpp/kernels/convolution_depthwise.hpp b/aidge_export_cpp/kernels/convolution_depthwise.hpp
index 1ee9b45..7ad506c 100644
--- a/aidge_export_cpp/kernels/convolution_depthwise.hpp
+++ b/aidge_export_cpp/kernels/convolution_depthwise.hpp
@@ -42,6 +42,10 @@ void convolution_depthwise_forward(
     static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
         "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
 
+    constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>();
+    constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>();
+    constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>();
+
     constexpr size_t DILATED_KERNEL_HEIGHT 
             = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
 
@@ -62,20 +66,14 @@ void convolution_depthwise_forward(
                     0, DILATED_KERNEL_HEIGHT);
         const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
+        // oy loop should not be parallelized to allow memory wrapping: memory
+        // lines must be processed in order.
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
         for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (size_t output = 0; output < NB_OUTPUTS; ++output) {
-                // moved to inner loop for collapsing -->
-                const size_t sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
-                const size_t sxMax = (PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? DILATED_KERNEL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
-                            0, DILATED_KERNEL_WIDTH);
-                const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
+            for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) {
+                Output_T packedOutputVal;
 
                 const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
                 int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
@@ -84,82 +82,123 @@ void convolution_depthwise_forward(
                     oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
                                 - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
                 }
-                // <--
 
-                const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS;
+                for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) {
+                    if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) {
+                        break;
+                    }
 
-                Bias_T weightedSum = biases ? biases[output] : 0;
+                    const size_t output = outputPack * n_pack<Output_T>() + packElt;
 
-                for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
-                    if ((PADDING_Y != 0
-                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
-                    {
-                        continue;
-                    }
+                    // moved to inner loop for collapsing -->
+                    const size_t sxMin = (PADDING_X == 0) ? 0
+                        : max(PADDING_X - (ox * STRIDE_X), 0);
+                    const size_t sxMax = (PADDING_X == 0
+                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                                ? DILATED_KERNEL_WIDTH
+                        : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
+                                0, DILATED_KERNEL_WIDTH);
+                    const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
+                    // <--
 
-                    const size_t iPos = static_cast<size_t>(ix)
-                        + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y));
-                    int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
+                    const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS;
 
-                    // Wrapping cannot occur in the middle of a line, except if
-                    // there is only one line (1D)!
-                    bool wrapInRange = false;
+                    Bias_T weightedSum = biases ? biases[output] : 0;
 
-                    if (INPUT_MEM_WRAP_SIZE > 0
-                        && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                    {
-                        iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
-                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-                    }
-                    else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
-                        && CHANNELS_HEIGHT == 1 // single line (1D)!
-                        && iOffset + KERNEL_WIDTH * NB_CHANNELS
-                            > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                    {
-                        wrapInRange = true;
-                    }
+                    for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                        if ((PADDING_Y != 0
+                                || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
+                            && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
+                        {
+                            continue;
+                        }
 
-                    const size_t wOffset = (output*KERNEL_HEIGHT + sy) 
-                                        * KERNEL_WIDTH;
+                        const size_t iPos = static_cast<size_t>(ix)
+                            + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y));
+                        int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-                    if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
-                        && DILATION_X == 1 && ((PADDING_X == 0
-                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH))
-                    {
-                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
-                            inputs + iOffset + channel, 
-                            weights + wOffset, 
-                            weightedSum);
-                    }
-                    else {
-                        for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
-                            if ((PADDING_X != 0
-                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
-                            {
-                                continue;
+                        // Wrapping cannot occur in the middle of a line, except if
+                        // there is only one line (1D)!
+                        bool wrapInRange = false;
+
+                        if (INPUT_MEM_WRAP_SIZE > 0
+                            && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+                        else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
+                            && CHANNELS_HEIGHT == 1 // single line (1D)!
+                            && iOffset + KERNEL_WIDTH * NB_I_CHANNELS_PACKED
+                                > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            wrapInRange = true;
+                        }
+
+                        const size_t wOffset = (output*KERNEL_HEIGHT + sy) 
+                                            * KERNEL_WIDTH;
+                        const auto inPackOffset = channel % n_pack<Input_T>();
+
+                        if (!wrapInRange && NB_I_CHANNELS_PACKED == (INPUT_MEM_STRIDE / sizeof(Input_T))
+                            && DILATION_X == 1 && ((PADDING_X == 0
+                                && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                            || sxMax - sxMin == KERNEL_WIDTH))
+                        {
+                            const auto wPackOffset = wOffset % n_pack<Weight_T>();
+
+                            if (inPackOffset == 0 && wPackOffset == 0) {
+                                macsOnRange<KERNEL_WIDTH, NB_I_CHANNELS_PACKED>(
+                                    inputs + iOffset + channel / n_pack<Input_T>(), 
+                                    weights + wOffset / n_pack<Weight_T>(), 
+                                    weightedSum);
                             }
+                            else {
+                                for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                                    const int iOffsetInRange = iOffset
+                                        + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
 
-                            int iOffsetInRange = iOffset
-                                + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+                                    const auto in = inputs[iOffsetInRange + channel / n_pack<Input_T>()];
+                                    const auto w = weights[(wOffset + sx) / n_pack<Weight_T>()];
 
-                            if (wrapInRange
-                                && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                            {
-                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
-                                            - INPUT_MEM_CONT_OFFSET
-                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                                    weightedSum += pack_rev_get(in, inPackOffset)
+                                                    * pack_rev_get(w, (wOffset + sx) % n_pack<Weight_T>());
+                                }
                             }
+                        }
+                        else {
+                            for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                                if ((PADDING_X != 0
+                                        || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
+                                    && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
+                                {
+                                    continue;
+                                }
+
+                                int iOffsetInRange = iOffset
+                                    + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                                if (wrapInRange
+                                    && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                                {
+                                    iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                                - INPUT_MEM_CONT_OFFSET
+                                                - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                                }
 
-                            weightedSum += inputs[iOffsetInRange + channel]
-                                * weights[wOffset + sx];
+                                const auto in = inputs[iOffsetInRange + channel / n_pack<Input_T>()];
+                                const auto w = weights[(wOffset + sx) / n_pack<Weight_T>()];
+                                weightedSum += pack_rev_get(in, inPackOffset)
+                                                * pack_rev_get(w, (wOffset + sx) % n_pack<Weight_T>());
+                            }
                         }
                     }
+
+                    const auto outputVal
+                        = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
+                    pack_rev_set(packedOutputVal, packElt, outputVal);
                 }
 
-                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
+                outputs[oOffset + outputPack] = packedOutputVal;
             }
         }
     }
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index 7723a7d..0fe732b 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -39,67 +39,81 @@ void fullyconnected_forward (
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
-    constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
-    // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
+    constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>();
+
+    constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>();
+    constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>();
 
-    constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS;
-    constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH;
-    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-    for (size_t och = 0; och < NB_OUTPUTS; ++och) {
-        Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
+    for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) {
+        Output_T packedOutputVal;
 
-        for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
-            int iOffset = INPUT_HEIGHT_STRIDE * iy;
+        for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) {
+            if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) {
+                break;
+            }
 
-            // Wrapping cannot occur in the middle of a line, except if
-            // there is only one line (1D)!
-            bool wrapInRange = false;
+            const size_t och = outputPack * n_pack<Output_T>() + packElt;
 
-            if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
-                iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
-                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-            }
-            else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
-                && CHANNELS_HEIGHT == 1 // single line (1D)!
-                && iOffset + CHANNELS_WIDTH * NB_CHANNELS
-                    > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-            {
-                wrapInRange = true;
-            }
+            Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
 
-            const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och;
+            for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+                const int iPos = (CHANNELS_WIDTH * iy);
+                int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-            if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) {
-                macsOnRange<INPUT_HEIGHT_STRIDE>(
-                    inputs + iOffset,
-                    weights + wOffset,
-                    weightedSum);
-            }
-            else {
-                for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
-                    int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
+                // Wrapping cannot occur in the middle of a line, except if
+                // there is only one line (1D)!
+                bool wrapInRange = false;
 
-                    if (wrapInRange
-                        && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                    {
-                        iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
-                                    - INPUT_MEM_CONT_OFFSET
-                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-                    }
+                if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+                    iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                }
+                else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
+                    && CHANNELS_HEIGHT == 1 // single line (1D)!
+                    && iOffset + CHANNELS_WIDTH * NB_CHANNELS
+                        > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                {
+                    wrapInRange = true;
+                }
 
-                    macsOnRange<INPUT_WIDTH_STRIDE>(
-                        inputs + iOffsetInRange,
-                        weights + wOffset + ix * WEIGHT_WIDTH_STRIDE,
+                const size_t wOffset = NB_W_CHANNELS_PACKED * CHANNELS_WIDTH
+                                        * (iy + CHANNELS_HEIGHT * och);
+
+                if (!wrapInRange && (INPUT_MEM_STRIDE / sizeof(Input_T)) == NB_CHANNELS) {
+                    macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
+                        inputs + iOffset,
+                        weights + wOffset,
                         weightedSum);
                 }
+                else {
+                    for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                        int iOffsetInRange = iOffset + ix * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                        if (wrapInRange
+                            && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                        - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+
+                        macsOnRange<NB_CHANNELS>(
+                            inputs + iOffsetInRange,
+                            weights + wOffset + ix * NB_W_CHANNELS_PACKED,
+                            weightedSum);
+                    }
+                }
             }
+
+            const auto outputVal
+                = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+            pack_rev_set(packedOutputVal, packElt, outputVal);
         }
 
-        outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+        outputs[outputPack] = packedOutputVal;
     }
 }
 
@@ -135,22 +149,40 @@ void fullyconnected_default_forward (
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
+    constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>();
+    constexpr size_t WEIGHT_SPATIAL_SIZE = CHANNELS_WIDTH * CHANNELS_HEIGHT;
+
+    constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>();
+    constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>();
 
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-    for (size_t och = 0; och < NB_OUTPUTS; och++) {
-        Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
+    for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) {
+        Output_T packedOutputVal;
+
+        for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) {
+            if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) {
+                break;
+            }
 
-        const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och;
+            const size_t och = outputPack * n_pack<Output_T>() + packElt;
 
-        macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>(
-            inputs,
-            weights + wOffset,
-            weightedSum);
+            Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
 
-        outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+            const size_t wOffset = NB_W_CHANNELS_PACKED * WEIGHT_SPATIAL_SIZE * och;
+
+            macsOnRange<NB_CHANNELS * WEIGHT_SPATIAL_SIZE>(
+                inputs,
+                weights + wOffset,
+                weightedSum);
+
+            const auto outputVal
+                = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+            pack_rev_set(packedOutputVal, packElt, outputVal);
+        }
+
+        outputs[outputPack] = packedOutputVal;
     }
 }
 
@@ -186,61 +218,77 @@ void fullyconnected_transpose_forward (
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
-    constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
-    // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
+    constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>();
+    constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>();
+    constexpr size_t WEIGHT_SPATIAL_SIZE = CHANNELS_WIDTH * CHANNELS_HEIGHT;
+
+    constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>();
+    constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>();
 
-    constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH;
-    constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH;
-    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-    for (size_t och = 0; och < NB_OUTPUTS; och++) {
-        Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
+    for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) {
+        Output_T packedOutputVal = {};  // zero-init: padding bits of a partial last pack stay deterministic

-        for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
-            int iOffset = INPUT_HEIGHT_STRIDE * iy;
+        for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) {
+            if (outputPack == NB_OUTPUTS_PACKED - 1 && NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) {
+                break;  // only the final, partial pack is truncated
+            }
 
-            // Wrapping cannot occur in the middle of a line, except if
-            // there is only one line (1D)!
-            bool wrapInRange = false;
+            const size_t och = outputPack * n_pack<Output_T>() + packElt;
 
-            if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
-                iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
-                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
-            }
-            else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
-                && CHANNELS_HEIGHT == 1 // single line (1D)!
-                && iOffset + CHANNELS_WIDTH * NB_CHANNELS
-                    > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-            {
-                wrapInRange = true;
-            }
+            Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
 
-            const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy;
+            for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+                const int iPos = (CHANNELS_WIDTH * iy);
+                int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-            for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
-                int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
+                // Wrapping cannot occur in the middle of a line, except if
+                // there is only one line (1D)!
+                bool wrapInRange = false;
 
-                if (wrapInRange
-                    && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
-                {
-                    iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
-                                - INPUT_MEM_CONT_OFFSET
+                if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+                    iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
                                 - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
                 }
+                else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
+                    && CHANNELS_HEIGHT == 1 // single line (1D)!
+                    && iOffset + CHANNELS_WIDTH * NB_I_CHANNELS_PACKED
+                        > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                {
+                    wrapInRange = true;
+                }
+
+                const int wOffset = CHANNELS_WIDTH
+                                        * (iy + CHANNELS_HEIGHT * NB_W_CHANNELS_PACKED * och);
+
+                for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                    int iOffsetInRange = iOffset + ix * (INPUT_MEM_STRIDE / sizeof(Input_T));
 
-                // Beware that the pointer increment for weights is
-                // CHANNELS_HEIGHT*CHANNELS_WIDTH
-                macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>(
-                    inputs + iOffsetInRange,
-                    weights + wOffset + ix,
-                    weightedSum);
+                    if (wrapInRange
+                        && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                    - INPUT_MEM_CONT_OFFSET
+                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                    }
+
+                    // Beware that the pointer increment for weights is
+                    // CHANNELS_HEIGHT*CHANNELS_WIDTH
+                    macsOnRange<NB_CHANNELS, WEIGHT_SPATIAL_SIZE>(
+                        inputs + iOffsetInRange,
+                        weights + wOffset + ix,
+                        weightedSum);
+                }
             }
+
+            const auto outputVal
+                = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+            pack_rev_set(packedOutputVal, packElt, outputVal);
         }
 
-        outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+        outputs[outputPack] = packedOutputVal;
     }
 }
 
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index 41d9174..e66465c 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -7,6 +7,86 @@
 #include <cmath>
 #include <stdexcept>
 
+// ----------------------------------------------------------------------------
+// Max pooling helpers - Unpacked inputs
+// ----------------------------------------------------------------------------
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 1)>::type* = nullptr>
+static void fillMaxVal(Input_T& __restrict maxVal)
+{
+    maxVal = std::numeric_limits<Input_T>::lowest();
+}
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 1)>::type* = nullptr>
+static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal)
+{
+    if (inputs > maxVal){
+        maxVal = inputs;
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Max pooling helpers - Packed <2> inputs
+// ----------------------------------------------------------------------------
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr>
+static void fillMaxVal(Input_T& __restrict maxVal)
+{
+    maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest();
+    maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest();
+}
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr>
+static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal)
+{
+    if (inputs.rev_fields.op0 > maxVal.rev_fields.op0){
+        maxVal.rev_fields.op0 = inputs.rev_fields.op0;
+    }
+
+    if (inputs.rev_fields.op1 > maxVal.rev_fields.op1){
+        maxVal.rev_fields.op1 = inputs.rev_fields.op1;
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Max pooling helpers - Packed <4> inputs
+// ----------------------------------------------------------------------------
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr>
+static void fillMaxVal(Input_T& __restrict maxVal)
+{
+    maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest();
+    maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest();
+    maxVal.rev_fields.op2 = std::numeric_limits<Input_T>::lowest();
+    maxVal.rev_fields.op3 = std::numeric_limits<Input_T>::lowest();
+}
+
+template<class Input_T,
+            typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr>
+static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal)
+{
+    if (inputs.rev_fields.op0 > maxVal.rev_fields.op0){
+        maxVal.rev_fields.op0 = inputs.rev_fields.op0;
+    }
+
+    if (inputs.rev_fields.op1 > maxVal.rev_fields.op1){
+        maxVal.rev_fields.op1 = inputs.rev_fields.op1;
+    }
+
+    if (inputs.rev_fields.op2 > maxVal.rev_fields.op2){
+        maxVal.rev_fields.op2 = inputs.rev_fields.op2;
+    }
+
+    if (inputs.rev_fields.op3 > maxVal.rev_fields.op3){
+        maxVal.rev_fields.op3 = inputs.rev_fields.op3;
+    }
+}
+
 
 template<size_t NB_CHANNELS,
          size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
@@ -35,6 +115,8 @@ void pooling_forward(
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs)
 {
+    static_assert(POOLING_TYPE != Average || (!is_packed<Input_T>() && !is_packed<Output_T>()), "I/O packing not yet supported for Average pooling");
+
     constexpr size_t OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y;
     constexpr size_t OUTPUTS_WIDTH_NOPAD
@@ -49,6 +131,8 @@ void pooling_forward(
                     0, POOL_HEIGHT);
         const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
+        // oy loop should not be parallelized to allow memory wrapping: memory
+        // lines must be processed in order.
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
@@ -74,7 +158,8 @@ void pooling_forward(
                 // <--
 
                 if (POOLING_TYPE == Max) {
-                    Input_T maxVal = std::numeric_limits<Input_T>::lowest();
+                    Input_T maxVal;
+                    fillMaxVal(maxVal);
 
                     for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) {
                         if ((PADDING_Y != 0
@@ -125,8 +210,7 @@ void pooling_forward(
                                             - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
                             }
 
-                            if (inputs[iOffsetInRange] > maxVal)
-                                maxVal = inputs[iOffsetInRange];
+                            findMaxVal(inputs[iOffsetInRange], maxVal);
                         }
                     }
 
diff --git a/aidge_export_cpp/static/macs.hpp b/aidge_export_cpp/static/macs.hpp
index 8bcb0fc..cf8ee9d 100644
--- a/aidge_export_cpp/static/macs.hpp
+++ b/aidge_export_cpp/static/macs.hpp
@@ -985,129 +985,4 @@ static void macsOnRange(const Input_T* __restrict inputs,
     }
 }
 
-/***************************************************************************************
-**************************************PACK_ACTIVATIONS**********************************
-***************************************************************************************/
-template<typename Output_T>
-static void compact_data_during_loop (const uint8_t value,
-                            Output_T* __restrict outputs,
-                            int* outputOffset,
-                            PackSupport* infoPack)
-{
-    constexpr unsigned int mask = (1L << std::numeric_limits<Output_T>::digits) - 1;
-    constexpr unsigned int nbSlot = ceil((double)8/std::numeric_limits<Output_T>::digits);
-
-    infoPack->accumulator |= value & mask;
-    infoPack->cptAccumulator += 1;
-
-    if (infoPack->cptAccumulator == nbSlot) {
-        outputs[*(outputOffset)] = infoPack->accumulator;
-        infoPack->cptAccumulator = 0;
-        infoPack->accumulator = 0;
-    }
-    else {
-        infoPack->accumulator <<= std::numeric_limits<Output_T>::digits;
-    }
-}
-
-template<typename Output_T>
-static void compact_data_end_loop (Output_T* __restrict outputs,
-                            int* outputOffset,
-                            PackSupport* infoPack)
-{
-    // if data still accumulated but not stored
-    if (infoPack->cptAccumulator != 0) {
-        constexpr unsigned int nbSlot = ceil((double)8/std::numeric_limits<Output_T>::digits);
-
-        // Add extra zero to shift data to the left
-        infoPack->cptAccumulator += 1;
-        while (infoPack->cptAccumulator < nbSlot) {
-            infoPack->accumulator <<= std::numeric_limits<Output_T>::digits;
-            infoPack->cptAccumulator += 1;
-        }
-        outputs[*(outputOffset)] = infoPack->accumulator;
-        infoPack->cptAccumulator = 0;
-        infoPack->accumulator = 0;
-    }
-}
-
-
-/*************************************************************************************
-***************************************MAX_POOL_4bits*********************************
-***************************************************************************************/
-template<class Input_T,
-            typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr>
-static void fillMaxVal(Input_T& __restrict maxVal)
-{
-    maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest();
-    maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest();
-}
-
-template<class Input_T,
-            typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr>
-static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal)
-{
-    if(inputs.rev_fields.op0 > maxVal.rev_fields.op0){
-        maxVal.rev_fields.op0 = inputs.rev_fields.op0;
-    }
-    if(inputs.rev_fields.op1 > maxVal.rev_fields.op1){
-        maxVal.rev_fields.op1 = inputs.rev_fields.op1;
-    }
-}
-
-template<class Input_T, class Output_T,
-            typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr>
-static void packMaxVal(Input_T& __restrict maxVal,
-                                            Output_T* __restrict outputs,
-                                            int* outputOffset,
-                                            PackSupport* infoPack)
-{
-    compact_data_during_loop(maxVal.rev_fields.op0, outputs, outputOffset, infoPack);
-    compact_data_during_loop(maxVal.rev_fields.op1, outputs, outputOffset, infoPack);
-}
-
-/*************************************************************************************
-***************************************MAX_POOL_2bits*********************************
-***************************************************************************************/
-template<class Input_T,
-            typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr>
-static void fillMaxVal(Input_T& __restrict maxVal)
-{
-    maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest();
-    maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest();
-    maxVal.rev_fields.op2 = std::numeric_limits<Input_T>::lowest();
-    maxVal.rev_fields.op3 = std::numeric_limits<Input_T>::lowest();
-}
-
-template<class Input_T,
-            typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr>
-static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal)
-{
-    if(inputs.rev_fields.op0 > maxVal.rev_fields.op0){
-        maxVal.rev_fields.op0 = inputs.rev_fields.op0;
-    }
-    if(inputs.rev_fields.op1 > maxVal.rev_fields.op1){
-        maxVal.rev_fields.op1 = inputs.rev_fields.op1;
-    }
-    if(inputs.rev_fields.op2 > maxVal.rev_fields.op2){
-        maxVal.rev_fields.op2 = inputs.rev_fields.op2;
-    }
-    if(inputs.rev_fields.op3 > maxVal.rev_fields.op3){
-        maxVal.rev_fields.op3 = inputs.rev_fields.op3;
-    }
-}
-
-template<class Input_T, class Output_T,
-            typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr>
-static void packMaxVal(Input_T& __restrict maxVal,
-                                            Output_T* __restrict outputs,
-                                            int* outputOffset,
-                                            PackSupport* infoPack)
-{
-    compact_data_during_loop(maxVal.rev_fields.op0, outputs, outputOffset, infoPack);
-    compact_data_during_loop(maxVal.rev_fields.op1, outputs, outputOffset, infoPack);
-    compact_data_during_loop(maxVal.rev_fields.op2, outputs, outputOffset, infoPack);
-    compact_data_during_loop(maxVal.rev_fields.op3, outputs, outputOffset, infoPack);
-}
-
 #endif
diff --git a/aidge_export_cpp/static/typedefs.hpp b/aidge_export_cpp/static/typedefs.hpp
index e680417..1544a98 100644
--- a/aidge_export_cpp/static/typedefs.hpp
+++ b/aidge_export_cpp/static/typedefs.hpp
@@ -85,6 +85,46 @@ struct n_pack : std::integral_constant<int, 1> {};
 template <std::size_t N_PACK, std::size_t N_BITS, bool SIGNED>
 struct n_pack<packed_bitint<N_PACK, N_BITS, SIGNED>> : std::integral_constant<int, N_PACK> {};
 
+template <typename T, typename std::enable_if<(n_pack<T>() == 1)>::type* = nullptr>
+constexpr auto pack_rev_get(T& data, int /*i*/) {
+    return data;
+}
+
+template <typename T, typename std::enable_if<(n_pack<T>() > 1)>::type* = nullptr>
+constexpr auto pack_rev_get(T& data, int i) {
+    assert((i < n_pack<T>()) && "pack index out of range");
+    switch (i) {
+        case 0: return data.rev_fields.op0;
+        case 1: return data.rev_fields.op1;
+        case 2: return data.rev_fields.op2;
+        case 3: return data.rev_fields.op3;
+        case 4: return data.rev_fields.op4;
+        case 5: return data.rev_fields.op5;
+        case 6: return data.rev_fields.op6;
+        default: return data.rev_fields.op7;
+    }
+}
+
+template <typename T, typename std::enable_if<(n_pack<T>() == 1)>::type* = nullptr>
+constexpr void pack_rev_set(T& data, int /*i*/, T val) {
+    data = val;
+}
+
+template <typename T, typename std::enable_if<(n_pack<T>() > 1)>::type* = nullptr>
+constexpr void pack_rev_set(T& data, int i, decltype(data.rev_fields.op0) val) {
+    assert((i < n_pack<T>()) && "pack index out of range");
+    switch (i) {
+        case 0: data.rev_fields.op0 = val; break;
+        case 1: data.rev_fields.op1 = val; break;
+        case 2: data.rev_fields.op2 = val; break;
+        case 3: data.rev_fields.op3 = val; break;
+        case 4: data.rev_fields.op4 = val; break;
+        case 5: data.rev_fields.op5 = val; break;
+        case 6: data.rev_fields.op6 = val; break;
+        default: data.rev_fields.op7 = val; break;
+    }
+}
+
 // ----------------------------------------------------------------------------
 // ---------------- Custom bit-width types specializations --------------------
 // ----------------------------------------------------------------------------
-- 
GitLab