From bb480b57c563d5b5acc55e14a6a23dc9c36a6254 Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Wed, 16 Jul 2025 17:46:22 +0200 Subject: [PATCH] Operators adaptation --- aidge_export_cpp/kernels/convolution.hpp | 169 +++++++------ .../kernels/convolution_depthwise.hpp | 177 +++++++------ aidge_export_cpp/kernels/fullyconnected.hpp | 234 +++++++++++------- aidge_export_cpp/kernels/pooling.hpp | 90 ++++++- aidge_export_cpp/static/macs.hpp | 125 ---------- aidge_export_cpp/static/typedefs.hpp | 40 +++ 6 files changed, 471 insertions(+), 364 deletions(-) diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp index 1b4f9fd..beaa3a6 100644 --- a/aidge_export_cpp/kernels/convolution.hpp +++ b/aidge_export_cpp/kernels/convolution.hpp @@ -40,6 +40,11 @@ void convolution_forward( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { + constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>(); + constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>(); + constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>(); + constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>(); + constexpr size_t OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y; constexpr size_t OUTPUTS_WIDTH_NOPAD @@ -54,21 +59,15 @@ void convolution_forward( 0, KERNEL_HEIGHT); const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); + // oy loop should not be parallelized to allow memory wrapping: memory + // lines must be processed in order. #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - for (size_t output = 0; output < NB_OUTPUTS; ++output) { - // moved to inner loop for collapsing --> - const size_t sxMin = (PADDING_X == 0) ? 
0 - : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0); - const size_t sxMax = (PADDING_X == 0 - && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) - ? KERNEL_WIDTH - : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X, - 0, KERNEL_WIDTH); - const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); - + for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) { + Output_T packedOutputVal; + const size_t oPos = (ox + OUTPUTS_WIDTH * oy); int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos; @@ -77,82 +76,104 @@ void convolution_forward( - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); } - // <-- - // Check if the biases are defined - Bias_T weightedSum = biases ? biases[output] : 0; - - for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { - if ((PADDING_Y != 0 - || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) - && sy >= syMax - syMin) - { + for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) { + if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) { break; } - const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix) - + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y)); - int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; + const size_t output = outputPack * n_pack<Output_T>() + packElt; + + // moved to inner loop for collapsing --> + const size_t sxMin = (PADDING_X == 0) ? 0 + : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0); + const size_t sxMax = (PADDING_X == 0 + && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + ? KERNEL_WIDTH + : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X, + 0, KERNEL_WIDTH); + const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); + + // <-- + // Check if the biases are defined + Bias_T weightedSum = biases ? 
biases[output] : 0; + + for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { + if ((PADDING_Y != 0 + || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) + && sy >= syMax - syMin) + { + break; + } - // Wrapping cannot occur in the middle of a line, except if - // there is only one line (1D)! - bool wrapInRange = false; + const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix) + + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y)); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - if (INPUT_MEM_WRAP_SIZE > 0 - && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } - else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 - && CHANNELS_HEIGHT == 1 // single line (1D)! - && iOffset + KERNEL_WIDTH * NB_CHANNELS - > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - wrapInRange = true; - } + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! 
+ bool wrapInRange = false; - const size_t wOffset = NB_CHANNELS * (sxMin - + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output)); - - if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T)) - && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) - || sxMax - sxMin == KERNEL_WIDTH)) - { - macsOnRange<KERNEL_WIDTH * NB_CHANNELS>( - inputs + iOffset, - weights + wOffset, - weightedSum); - } - else { - for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { - if ((PADDING_X != 0 - || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) - && sx >= sxMax - sxMin) - { - break; - } - - int iOffsetInRange = iOffset - + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + if (INPUT_MEM_WRAP_SIZE > 0 + && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + KERNEL_WIDTH * NB_I_CHANNELS_PACKED + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } - if (wrapInRange - && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffsetInRange += (INPUT_MEM_WRAP_OFFSET - - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } + const size_t wOffset = NB_W_CHANNELS_PACKED * (sxMin + + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output)); - macsOnRange<NB_CHANNELS>( - // same input line so no wrapping can occur - inputs + iOffsetInRange, - weights + wOffset + sx * NB_CHANNELS, + if (!wrapInRange && NB_I_CHANNELS_PACKED == (INPUT_MEM_STRIDE / sizeof(Input_T)) + && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + || sxMax - sxMin == KERNEL_WIDTH)) + { + macsOnRange<KERNEL_WIDTH * NB_CHANNELS>( + inputs + iOffset, + weights + wOffset, weightedSum); } + else { + for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { + if ((PADDING_X != 0 + || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) + && sx >= sxMax - sxMin) + { + break; + } + + int iOffsetInRange = iOffset + + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + macsOnRange<NB_CHANNELS>( + // same input line so no wrapping can occur + inputs + iOffsetInRange, + weights + wOffset + sx * NB_W_CHANNELS_PACKED, + weightedSum); + } + } } + + const auto outputVal + = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling); + pack_rev_set(packedOutputVal, packElt, outputVal); } - outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling); + outputs[oOffset + outputPack] = packedOutputVal; } } } diff --git a/aidge_export_cpp/kernels/convolution_depthwise.hpp 
b/aidge_export_cpp/kernels/convolution_depthwise.hpp index 1ee9b45..7ad506c 100644 --- a/aidge_export_cpp/kernels/convolution_depthwise.hpp +++ b/aidge_export_cpp/kernels/convolution_depthwise.hpp @@ -42,6 +42,10 @@ void convolution_depthwise_forward( static_assert(NB_OUTPUTS % NB_CHANNELS == 0, "NB_OUTPUTS should be a multiple of NB_CHANNELS."); + constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>(); + constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>(); + constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>(); + constexpr size_t DILATED_KERNEL_HEIGHT = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); @@ -62,20 +66,14 @@ void convolution_depthwise_forward( 0, DILATED_KERNEL_HEIGHT); const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); + // oy loop should not be parallelized to allow memory wrapping: memory + // lines must be processed in order. #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - for (size_t output = 0; output < NB_OUTPUTS; ++output) { - // moved to inner loop for collapsing --> - const size_t sxMin = (PADDING_X == 0) ? 0 - : max(PADDING_X - (ox * STRIDE_X), 0); - const size_t sxMax = (PADDING_X == 0 - && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) - ? 
DILATED_KERNEL_WIDTH - : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), - 0, DILATED_KERNEL_WIDTH); - const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); + for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) { + Output_T packedOutputVal; const size_t oPos = (ox + OUTPUTS_WIDTH * oy); int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos; @@ -84,82 +82,123 @@ void convolution_depthwise_forward( oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); } - // <-- - const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS; + for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) { + if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) { + break; + } - Bias_T weightedSum = biases ? biases[output] : 0; + const size_t output = outputPack * n_pack<Output_T>() + packElt; - for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { - if ((PADDING_Y != 0 - || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) - && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax))) - { - continue; - } + // moved to inner loop for collapsing --> + const size_t sxMin = (PADDING_X == 0) ? 0 + : max(PADDING_X - (ox * STRIDE_X), 0); + const size_t sxMax = (PADDING_X == 0 + && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + ? DILATED_KERNEL_WIDTH + : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), + 0, DILATED_KERNEL_WIDTH); + const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); + // <-- - const size_t iPos = static_cast<size_t>(ix) - + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y)); - int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; + const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS; - // Wrapping cannot occur in the middle of a line, except if - // there is only one line (1D)! - bool wrapInRange = false; + Bias_T weightedSum = biases ? 
biases[output] : 0; - if (INPUT_MEM_WRAP_SIZE > 0 - && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } - else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 - && CHANNELS_HEIGHT == 1 // single line (1D)! - && iOffset + KERNEL_WIDTH * NB_CHANNELS - > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - wrapInRange = true; - } + for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { + if ((PADDING_Y != 0 + || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) + && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax))) + { + continue; + } - const size_t wOffset = (output*KERNEL_HEIGHT + sy) - * KERNEL_WIDTH; + const size_t iPos = static_cast<size_t>(ix) + + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y)); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T)) - && DILATION_X == 1 && ((PADDING_X == 0 - && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) - || sxMax - sxMin == KERNEL_WIDTH)) - { - macsOnRange<KERNEL_WIDTH, NB_CHANNELS>( - inputs + iOffset + channel, - weights + wOffset, - weightedSum); - } - else { - for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { - if ((PADDING_X != 0 - || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) - && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax))) - { - continue; + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; + + if (INPUT_MEM_WRAP_SIZE > 0 + && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + KERNEL_WIDTH * NB_I_CHANNELS_PACKED + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + const size_t wOffset = (output*KERNEL_HEIGHT + sy) + * KERNEL_WIDTH; + const auto inPackOffset = channel % n_pack<Input_T>(); + + if (!wrapInRange && NB_I_CHANNELS_PACKED == (INPUT_MEM_STRIDE / sizeof(Input_T)) + && DILATION_X == 1 && ((PADDING_X == 0 + && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + || sxMax - sxMin == KERNEL_WIDTH)) + { + const auto wPackOffset = wOffset % n_pack<Weight_T>(); + + if (inPackOffset == 0 && wPackOffset == 0) { + macsOnRange<KERNEL_WIDTH, NB_I_CHANNELS_PACKED>( + inputs + iOffset + channel / n_pack<Input_T>(), + weights + wOffset / n_pack<Weight_T>(), + weightedSum); } + else { + for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { + const int iOffsetInRange = iOffset + + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); - int iOffsetInRange = iOffset - + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + const auto in = inputs[iOffsetInRange + channel / n_pack<Input_T>()]; + const auto w = weights[(wOffset + sx) / n_pack<Weight_T>()]; - if (wrapInRange - && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffsetInRange += (INPUT_MEM_WRAP_OFFSET - - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + weightedSum += pack_rev_get(in, inPackOffset) + * pack_rev_get(w, (wOffset + sx) % n_pack<Weight_T>()); + } } + } + else { + for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { + if ((PADDING_X != 0 + || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) + && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax))) + { + continue; + } + + int iOffsetInRange = iOffset + + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } - weightedSum += 
inputs[iOffsetInRange + channel] - * weights[wOffset + sx]; + const auto in = inputs[iOffsetInRange + channel / n_pack<Input_T>()]; + const auto w = weights[(wOffset + sx) / n_pack<Weight_T>()]; + weightedSum += pack_rev_get(in, inPackOffset) + * pack_rev_get(w, (wOffset + sx) % n_pack<Weight_T>()); + } } } + + const auto outputVal + = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling); + pack_rev_set(packedOutputVal, packElt, outputVal); } - outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling); + outputs[oOffset + outputPack] = packedOutputVal; } } } diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp index 7723a7d..0fe732b 100644 --- a/aidge_export_cpp/kernels/fullyconnected.hpp +++ b/aidge_export_cpp/kernels/fullyconnected.hpp @@ -39,67 +39,81 @@ void fullyconnected_forward ( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T)); - constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH; - // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT; + constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>(); + + constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>(); + constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>(); - constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS; - constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH; - constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT; #ifdef _OPENMP #pragma omp parallel for #endif - for (size_t och = 0; och < NB_OUTPUTS; ++och) { - Bias_T weightedSum = (biases) ? 
biases[och] : Bias_T(0); + for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) { + Output_T packedOutputVal; - for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { - int iOffset = INPUT_HEIGHT_STRIDE * iy; + for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) { + if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) { + break; + } - // Wrapping cannot occur in the middle of a line, except if - // there is only one line (1D)! - bool wrapInRange = false; + const size_t och = outputPack * n_pack<Output_T>() + packElt; - if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { - iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } - else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 - && CHANNELS_HEIGHT == 1 // single line (1D)! - && iOffset + CHANNELS_WIDTH * NB_CHANNELS - > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - wrapInRange = true; - } + Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); - const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och; + for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { + const int iPos = (CHANNELS_WIDTH * iy); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) { - macsOnRange<INPUT_HEIGHT_STRIDE>( - inputs + iOffset, - weights + wOffset, - weightedSum); - } - else { - for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { - int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE; + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! 
+ bool wrapInRange = false; - if (wrapInRange - && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffsetInRange += (INPUT_MEM_WRAP_OFFSET - - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } + if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! + && iOffset + CHANNELS_WIDTH * NB_CHANNELS + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } - macsOnRange<INPUT_WIDTH_STRIDE>( - inputs + iOffsetInRange, - weights + wOffset + ix * WEIGHT_WIDTH_STRIDE, + const size_t wOffset = NB_W_CHANNELS_PACKED * CHANNELS_WIDTH + * (iy + CHANNELS_HEIGHT * och); + + if (!wrapInRange && (INPUT_MEM_STRIDE / sizeof(Input_T)) == NB_CHANNELS) { + macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>( + inputs + iOffset, + weights + wOffset, weightedSum); } + else { + for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { + int iOffsetInRange = iOffset + ix * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + macsOnRange<NB_CHANNELS>( + inputs + iOffsetInRange, + weights + wOffset + ix * NB_W_CHANNELS_PACKED, + weightedSum); + } + } } + + const auto outputVal + = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + pack_rev_set(packedOutputVal, packElt, outputVal); } - outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + outputs[outputPack] = packedOutputVal; } } @@ -135,22 +149,40 @@ void fullyconnected_default_forward ( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - 
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT; + constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>(); + constexpr size_t WEIGHT_SPATIAL_SIZE = CHANNELS_WIDTH * CHANNELS_HEIGHT; + + constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>(); + constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>(); #ifdef _OPENMP #pragma omp parallel for #endif - for (size_t och = 0; och < NB_OUTPUTS; och++) { - Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); + for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) { + Output_T packedOutputVal; + + for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) { + if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) { + break; + } - const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och; + const size_t och = outputPack * n_pack<Output_T>() + packElt; - macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>( - inputs, - weights + wOffset, - weightedSum); + Bias_T weightedSum = (biases) ? 
biases[och] : Bias_T(0); - outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + const size_t wOffset = NB_W_CHANNELS_PACKED * WEIGHT_SPATIAL_SIZE * och; + + macsOnRange<NB_CHANNELS * WEIGHT_SPATIAL_SIZE>( + inputs, + weights + wOffset, + weightedSum); + + const auto outputVal + = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + pack_rev_set(packedOutputVal, packElt, outputVal); + } + + outputs[outputPack] = packedOutputVal; } } @@ -186,61 +218,77 @@ void fullyconnected_transpose_forward ( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T)); - constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH; - // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT; + constexpr size_t NB_I_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Input_T>() - 1) / n_pack<Input_T>(); + constexpr size_t NB_W_CHANNELS_PACKED = (NB_CHANNELS + n_pack<Weight_T>() - 1) / n_pack<Weight_T>(); + constexpr size_t WEIGHT_SPATIAL_SIZE = CHANNELS_WIDTH * CHANNELS_HEIGHT; + + constexpr size_t NB_OUTPUTS_PACKED = (NB_OUTPUTS + n_pack<Output_T>() - 1) / n_pack<Output_T>(); + constexpr size_t NB_OUTPUTS_REM = NB_OUTPUTS % n_pack<Output_T>(); - constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH; - constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH; - constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH; #ifdef _OPENMP #pragma omp parallel for #endif - for (size_t och = 0; och < NB_OUTPUTS; och++) { - Bias_T weightedSum = (biases) ? 
biases[och] : Bias_T(0); + for (size_t outputPack = 0; outputPack < NB_OUTPUTS_PACKED; ++outputPack) { + Output_T packedOutputVal; - for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { - int iOffset = INPUT_HEIGHT_STRIDE * iy; + for (size_t packElt = 0; packElt < n_pack<Output_T>(); ++packElt) { + if (NB_OUTPUTS_REM != 0 && packElt == NB_OUTPUTS_REM) { + break; + } - // Wrapping cannot occur in the middle of a line, except if - // there is only one line (1D)! - bool wrapInRange = false; + const size_t och = outputPack * n_pack<Output_T>() + packElt; - if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { - iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET - - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); - } - else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 - && CHANNELS_HEIGHT == 1 // single line (1D)! - && iOffset + CHANNELS_WIDTH * NB_CHANNELS - > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - wrapInRange = true; - } + Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); - const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy; + for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { + const int iPos = (CHANNELS_WIDTH * iy); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { - int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE; + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! 
+ bool wrapInRange = false; - if (wrapInRange - && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) - { - iOffsetInRange += (INPUT_MEM_WRAP_OFFSET - - INPUT_MEM_CONT_OFFSET + if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); } + else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! + && iOffset + CHANNELS_WIDTH * NB_I_CHANNELS_PACKED + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + const int wOffset = CHANNELS_WIDTH + * (iy + CHANNELS_HEIGHT * NB_W_CHANNELS_PACKED * och); + + for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { + int iOffsetInRange = iOffset + ix * (INPUT_MEM_STRIDE / sizeof(Input_T)); - // Beware that the pointer increment for weights is - // CHANNELS_HEIGHT*CHANNELS_WIDTH - macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>( - inputs + iOffsetInRange, - weights + wOffset + ix, - weightedSum); + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + // Beware that the pointer increment for weights is + // CHANNELS_HEIGHT*CHANNELS_WIDTH + macsOnRange<NB_CHANNELS, WEIGHT_SPATIAL_SIZE>( + inputs + iOffsetInRange, + weights + wOffset + ix, + weightedSum); + } } + + const auto outputVal + = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + pack_rev_set(packedOutputVal, packElt, outputVal); } - outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + outputs[outputPack] = packedOutputVal; } } diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp index 41d9174..e66465c 100644 --- a/aidge_export_cpp/kernels/pooling.hpp +++ 
b/aidge_export_cpp/kernels/pooling.hpp @@ -7,6 +7,86 @@ #include <cmath> #include <stdexcept> +// ---------------------------------------------------------------------------- +// Max pooling helpers - Unpacked inputs +// ---------------------------------------------------------------------------- + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 1)>::type* = nullptr> +static void fillMaxVal(Input_T& __restrict maxVal) +{ + maxVal = std::numeric_limits<Input_T>::lowest(); +} + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 1)>::type* = nullptr> +static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal) +{ + if (inputs > maxVal){ + maxVal = inputs; + } +} + +// ---------------------------------------------------------------------------- +// Max pooling helpers - Packed <2> inputs +// ---------------------------------------------------------------------------- + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr> +static void fillMaxVal(Input_T& __restrict maxVal) +{ + maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest(); + maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest(); +} + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr> +static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal) +{ + if (inputs.rev_fields.op0 > maxVal.rev_fields.op0){ + maxVal.rev_fields.op0 = inputs.rev_fields.op0; + } + + if (inputs.rev_fields.op1 > maxVal.rev_fields.op1){ + maxVal.rev_fields.op1 = inputs.rev_fields.op1; + } +} + +// ---------------------------------------------------------------------------- +// Max pooling helpers - Packed <4> inputs +// ---------------------------------------------------------------------------- + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr> +static void fillMaxVal(Input_T& __restrict maxVal) +{ 
+ maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest(); + maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest(); + maxVal.rev_fields.op2 = std::numeric_limits<Input_T>::lowest(); + maxVal.rev_fields.op3 = std::numeric_limits<Input_T>::lowest(); +} + +template<class Input_T, + typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr> +static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal) +{ + if (inputs.rev_fields.op0 > maxVal.rev_fields.op0){ + maxVal.rev_fields.op0 = inputs.rev_fields.op0; + } + + if (inputs.rev_fields.op1 > maxVal.rev_fields.op1){ + maxVal.rev_fields.op1 = inputs.rev_fields.op1; + } + + if (inputs.rev_fields.op2 > maxVal.rev_fields.op2){ + maxVal.rev_fields.op2 = inputs.rev_fields.op2; + } + + if (inputs.rev_fields.op3 > maxVal.rev_fields.op3){ + maxVal.rev_fields.op3 = inputs.rev_fields.op3; + } +} + template<size_t NB_CHANNELS, size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, @@ -35,6 +115,8 @@ void pooling_forward( const Input_T* __restrict inputs, Output_T* __restrict outputs) { + static_assert(POOLING_TYPE != Average || (!is_packed<Input_T>() && !is_packed<Output_T>()), "I/O packing not yet supported for Average pooling"); + constexpr size_t OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y; constexpr size_t OUTPUTS_WIDTH_NOPAD @@ -49,6 +131,8 @@ void pooling_forward( 0, POOL_HEIGHT); const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); + // oy loop should not be parallelized to allow memory wrapping: memory + // lines must be processed in order. 
#ifdef _OPENMP #pragma omp parallel for collapse(2) #endif @@ -74,7 +158,8 @@ void pooling_forward( // <-- if (POOLING_TYPE == Max) { - Input_T maxVal = std::numeric_limits<Input_T>::lowest(); + Input_T maxVal; + fillMaxVal(maxVal); for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) { if ((PADDING_Y != 0 @@ -125,8 +210,7 @@ void pooling_forward( - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); } - if (inputs[iOffsetInRange] > maxVal) - maxVal = inputs[iOffsetInRange]; + findMaxVal(inputs[iOffsetInRange], maxVal); } } diff --git a/aidge_export_cpp/static/macs.hpp b/aidge_export_cpp/static/macs.hpp index 8bcb0fc..cf8ee9d 100644 --- a/aidge_export_cpp/static/macs.hpp +++ b/aidge_export_cpp/static/macs.hpp @@ -985,129 +985,4 @@ static void macsOnRange(const Input_T* __restrict inputs, } } -/*************************************************************************************** -**************************************PACK_ACTIVATIONS********************************** -***************************************************************************************/ -template<typename Output_T> -static void compact_data_during_loop (const uint8_t value, - Output_T* __restrict outputs, - int* outputOffset, - PackSupport* infoPack) -{ - constexpr unsigned int mask = (1L << std::numeric_limits<Output_T>::digits) - 1; - constexpr unsigned int nbSlot = ceil((double)8/std::numeric_limits<Output_T>::digits); - - infoPack->accumulator |= value & mask; - infoPack->cptAccumulator += 1; - - if (infoPack->cptAccumulator == nbSlot) { - outputs[*(outputOffset)] = infoPack->accumulator; - infoPack->cptAccumulator = 0; - infoPack->accumulator = 0; - } - else { - infoPack->accumulator <<= std::numeric_limits<Output_T>::digits; - } -} - -template<typename Output_T> -static void compact_data_end_loop (Output_T* __restrict outputs, - int* outputOffset, - PackSupport* infoPack) -{ - // if data still accumulated but not stored - if (infoPack->cptAccumulator != 0) { - constexpr unsigned int nbSlot = 
ceil((double)8/std::numeric_limits<Output_T>::digits); - - // Add extra zero to shift data to the left - infoPack->cptAccumulator += 1; - while (infoPack->cptAccumulator < nbSlot) { - infoPack->accumulator <<= std::numeric_limits<Output_T>::digits; - infoPack->cptAccumulator += 1; - } - outputs[*(outputOffset)] = infoPack->accumulator; - infoPack->cptAccumulator = 0; - infoPack->accumulator = 0; - } -} - - -/************************************************************************************* -***************************************MAX_POOL_4bits********************************* -***************************************************************************************/ -template<class Input_T, - typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr> -static void fillMaxVal(Input_T& __restrict maxVal) -{ - maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest(); - maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest(); -} - -template<class Input_T, - typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr> -static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal) -{ - if(inputs.rev_fields.op0 > maxVal.rev_fields.op0){ - maxVal.rev_fields.op0 = inputs.rev_fields.op0; - } - if(inputs.rev_fields.op1 > maxVal.rev_fields.op1){ - maxVal.rev_fields.op1 = inputs.rev_fields.op1; - } -} - -template<class Input_T, class Output_T, - typename std::enable_if<(n_pack<Input_T>() == 2)>::type* = nullptr> -static void packMaxVal(Input_T& __restrict maxVal, - Output_T* __restrict outputs, - int* outputOffset, - PackSupport* infoPack) -{ - compact_data_during_loop(maxVal.rev_fields.op0, outputs, outputOffset, infoPack); - compact_data_during_loop(maxVal.rev_fields.op1, outputs, outputOffset, infoPack); -} - -/************************************************************************************* -***************************************MAX_POOL_2bits********************************* 
-***************************************************************************************/ -template<class Input_T, - typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr> -static void fillMaxVal(Input_T& __restrict maxVal) -{ - maxVal.rev_fields.op0 = std::numeric_limits<Input_T>::lowest(); - maxVal.rev_fields.op1 = std::numeric_limits<Input_T>::lowest(); - maxVal.rev_fields.op2 = std::numeric_limits<Input_T>::lowest(); - maxVal.rev_fields.op3 = std::numeric_limits<Input_T>::lowest(); -} - -template<class Input_T, - typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr> -static void findMaxVal(const Input_T& __restrict inputs, Input_T& __restrict maxVal) -{ - if(inputs.rev_fields.op0 > maxVal.rev_fields.op0){ - maxVal.rev_fields.op0 = inputs.rev_fields.op0; - } - if(inputs.rev_fields.op1 > maxVal.rev_fields.op1){ - maxVal.rev_fields.op1 = inputs.rev_fields.op1; - } - if(inputs.rev_fields.op2 > maxVal.rev_fields.op2){ - maxVal.rev_fields.op2 = inputs.rev_fields.op2; - } - if(inputs.rev_fields.op3 > maxVal.rev_fields.op3){ - maxVal.rev_fields.op3 = inputs.rev_fields.op3; - } -} - -template<class Input_T, class Output_T, - typename std::enable_if<(n_pack<Input_T>() == 4)>::type* = nullptr> -static void packMaxVal(Input_T& __restrict maxVal, - Output_T* __restrict outputs, - int* outputOffset, - PackSupport* infoPack) -{ - compact_data_during_loop(maxVal.rev_fields.op0, outputs, outputOffset, infoPack); - compact_data_during_loop(maxVal.rev_fields.op1, outputs, outputOffset, infoPack); - compact_data_during_loop(maxVal.rev_fields.op2, outputs, outputOffset, infoPack); - compact_data_during_loop(maxVal.rev_fields.op3, outputs, outputOffset, infoPack); -} - #endif diff --git a/aidge_export_cpp/static/typedefs.hpp b/aidge_export_cpp/static/typedefs.hpp index e680417..1544a98 100644 --- a/aidge_export_cpp/static/typedefs.hpp +++ b/aidge_export_cpp/static/typedefs.hpp @@ -85,6 +85,46 @@ struct n_pack : std::integral_constant<int, 1> {}; template 
<std::size_t N_PACK, std::size_t N_BITS, bool SIGNED> struct n_pack<packed_bitint<N_PACK, N_BITS, SIGNED>> : std::integral_constant<int, N_PACK> {}; +template <typename T, typename std::enable_if<(n_pack<T>() == 1)>::type* = nullptr> +constexpr auto pack_rev_get(T& data, int /*i*/) { + return data; +} + +template <typename T, typename std::enable_if<(n_pack<T>() > 1)>::type* = nullptr> +constexpr auto pack_rev_get(T& data, int i) { + assert((i < n_pack<T>()) && "pack index out of range"); + switch (i) { + case 0: return data.rev_fields.op0; + case 1: return data.rev_fields.op1; + case 2: return data.rev_fields.op2; + case 3: return data.rev_fields.op3; + case 4: return data.rev_fields.op4; + case 5: return data.rev_fields.op5; + case 6: return data.rev_fields.op6; + default: return data.rev_fields.op7; + } +} + +template <typename T, typename std::enable_if<(n_pack<T>() == 1)>::type* = nullptr> +constexpr void pack_rev_set(T& data, int /*i*/, T val) { + data = val; +} + +template <typename T, typename std::enable_if<(n_pack<T>() > 1)>::type* = nullptr> +constexpr void pack_rev_set(T& data, int i, decltype(data.fields.op0) val) { + assert((i < n_pack<T>()) && "pack index out of range"); + switch (i) { + case 0: data.rev_fields.op0 = val; break; + case 1: data.rev_fields.op1 = val; break; + case 2: data.rev_fields.op2 = val; break; + case 3: data.rev_fields.op3 = val; break; + case 4: data.rev_fields.op4 = val; break; + case 5: data.rev_fields.op5 = val; break; + case 6: data.rev_fields.op6 = val; break; + default: data.rev_fields.op7 = val; break; + } +} + // ---------------------------------------------------------------------------- // ---------------- Custom bit-width types specializations -------------------- // ---------------------------------------------------------------------------- -- GitLab