From dfef8d164ade704b4c97286d025a881446923e37 Mon Sep 17 00:00:00 2001 From: thibault allenet <thibault.allenet@cea.fr> Date: Fri, 13 Dec 2024 15:27:09 +0000 Subject: [PATCH] Add Implementations for low bit kernels --- .../kernels/Convolution/CustomConv.hpp | 146 + .../kernels/FullyConnected/CustomFc.hpp | 83 + .../kernels/Pooling/CustomPooling.hpp | 114 + .../_Aidge_Arm/kernels/Utils/Macs.hpp | 3371 ++++++++++++++++- .../kernels/Utils/nn_scaling_functions.hpp | 18 +- .../kernels/Utils/subkernels_functions.hpp | 312 ++ .../_Aidge_Arm/kernels/Utils/swar_arm_acle.h | 356 ++ .../_Aidge_Arm/kernels/Utils/utils.hpp | 2 +- 8 files changed, 4232 insertions(+), 170 deletions(-) create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp create mode 100644 aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp new file mode 100644 index 0000000..5f35aaf --- /dev/null +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/CustomConv.hpp @@ -0,0 +1,146 @@ +/* + (C) Copyright 2017 CEA LIST. All Rights Reserved. + Contributor(s): N2D2 Team + + This software is governed by the CeCILL-C license under French law and + abiding by the rules of distribution of free software. You can use, + modify and/ or redistribute the software under the terms of the CeCILL-C + license as circulated by CEA, CNRS and INRIA at the following URL + "http://www.cecill.info". 
+ + As a counterpart to the access to the source code and rights to copy, + modify and redistribute granted by the license, users are provided only + with a limited warranty and the software's author, the holder of the + economic rights, and the successive licensors have only limited + liability. + + The fact that you are presently reading this means that you have had + knowledge of the CeCILL-C license and that you accept its terms. +*/ + +#ifndef __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__ +#define __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__ + +#include <cmath> + +#include "kernels/typedefs.hpp" +#include "assert.h" +#include "utils.hpp" +#include "kernels/Macs.hpp" +#include "kernels/subkernels_functions.hpp" + +namespace N2D2_Export { +/** @brief Packed low-bit 2D convolution. For every output position (oy, ox) and output channel och, the sum starts at biasses[och], macsOnRange() accumulates over the kernel taps that fall inside the input, sat<Output_T>(sum, och, ACTIVATION, rescaling) produces the output value and compact_data_during_loop() / compact_data_end_loop() write it into outputs. @param inputs input data (presumably channel-packed per pixel - TODO confirm). @param outputs destination written through the compact_data helpers. @param biasses one bias per output channel. @param weights kernel weights addressed by och, sy and sx with stride WEIGHTS_BYTE. @param rescaling forwarded unchanged to sat(). */ +template<int NB_CHANNELS, + int CHANNELS_HEIGHT, int CHANNELS_WIDTH, + int NB_OUTPUTS, + int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, + int PADDING_Y, int PADDING_X, + int STRIDE_Y, int STRIDE_X, + int KERNEL_HEIGHT, int KERNEL_WIDTH, + ActivationFunction_T ACTIVATION, + // // Memory mapping: inputs + // int INPUT_MEM_CONT_OFFSET, + // int INPUT_MEM_CONT_SIZE, + // int INPUT_MEM_WRAP_OFFSET, + // int INPUT_MEM_WRAP_SIZE, + // int INPUT_MEM_STRIDE, + // // Memory mapping: outputs + // int OUTPUT_MEM_CONT_OFFSET, + // int OUTPUT_MEM_CONT_SIZE, + // int OUTPUT_MEM_WRAP_OFFSET, + // int OUTPUT_MEM_WRAP_SIZE, + // int OUTPUT_MEM_STRIDE, + typename Sum_T, typename Input_T, typename Output_T, + typename Weight_T, typename Bias_T, typename Rescaling_T> +__attribute__((always_inline)) inline static +void customconvcellPropagate(const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Bias_T* __restrict biasses, + const Weight_T* __restrict weights, + const Rescaling_T& __restrict rescaling) +{ + PackSupport infoPack = {0, 0}; + /* bits_norm_in and bits_norm_wt: bit width per element derived from numeric_limits digits; the expression yields 8 when digits >= 8 and 8 / floor(8 / digits) otherwise. */ + constexpr int bits_norm_in = (std::numeric_limits<Input_T>::digits >= 8) + ? 
8/std::ceil(8/(float)std::numeric_limits<Input_T>::digits) + : 8/std::floor(8/(float)std::numeric_limits<Input_T>::digits); + + constexpr int bits_norm_wt = (std::numeric_limits<Weight_T>::digits >= 8) + ? 8/std::ceil(8/(float)std::numeric_limits<Weight_T>::digits) + : 8/std::floor(8/(float)std::numeric_limits<Weight_T>::digits); + /* Per-pixel strides applied to the inputs and weights pointers. NOTE(review): (bits % 8) is added before dividing by 8, so the result can exceed ceil(bits / 8), e.g. 15 bits gives 3 - confirm this is intended. Using std::ceil in a constexpr initializer also depends on the compiler folding it, since it is not constexpr in the standard before C++23. */ + constexpr int INPUTS_BYTE + = std::ceil(((NB_CHANNELS * bits_norm_in) + + (NB_CHANNELS * bits_norm_in) % 8) / (float)8); + constexpr int WEIGHTS_BYTE + = std::ceil(((NB_CHANNELS * bits_norm_wt) + + (NB_CHANNELS * bits_norm_wt) % 8) / (float)8); + + int outputOffset = 0; + + int iy = 0; + for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0); + const int syMax = (PADDING_Y == 0) ? KERNEL_HEIGHT + : clamp(CHANNELS_HEIGHT + PADDING_Y - iy, + 0, KERNEL_HEIGHT); + + int ix = 0; + for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0); + const int sxMax = (PADDING_X == 0) ? 
KERNEL_WIDTH + : clamp(CHANNELS_WIDTH + PADDING_X - ix, + 0, KERNEL_WIDTH); + + for (int och = 0; och < NB_OUTPUTS; ++och) { + Sum_T weightedSum = biasses[och]; + + for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { + + if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) { + continue; + } + const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE + + (ix - PADDING_X) * INPUTS_BYTE; + + const int weightsOffset = och * KERNEL_HEIGHT * KERNEL_WIDTH * WEIGHTS_BYTE + + sy * KERNEL_WIDTH * WEIGHTS_BYTE; + /* When there is no horizontal padding and the channel bits of one pixel fill whole bytes, a single macsOnRange call covers the full kernel row; otherwise each column inside [sxMin, sxMax) is accumulated on its own. */ + // if (PADDING_X == 0 + // && (NB_CHANNELS * std::numeric_limits<Weight_T>::digits % 8 == 0) + // && (NB_CHANNELS * std::numeric_limits<Input_T>::digits % 8 == 0)) { + if (PADDING_X == 0 + && (NB_CHANNELS * bits_norm_wt % 8 == 0) + && (NB_CHANNELS * bits_norm_in % 8 == 0)) { + + macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(inputs + inputsOffset, + weights + weightsOffset, + weightedSum); + } + else { + for (int sx = 0; sx < KERNEL_WIDTH; ++sx) { + if(sx < sxMin || sx >= sxMax) { + continue; + } + macsOnRange<NB_CHANNELS>(inputs + inputsOffset + sx * INPUTS_BYTE, + weights + weightsOffset + sx * WEIGHTS_BYTE, + weightedSum); + } + } + } + Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling); + compact_data_during_loop(output, outputs, outputOffset, infoPack); + } + compact_data_end_loop(outputs, outputOffset, infoPack); + + ix += STRIDE_X; + } + iy += STRIDE_Y; + } +} + + +} // N2D2_Export + +#endif // __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__ diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp new file mode 100644 index 0000000..dc36818 --- /dev/null +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/CustomFc.hpp @@ -0,0 +1,83 @@ +/* + (C) Copyright 2017 CEA LIST. All Rights Reserved. 
+ Contributor(s): N2D2 Team + + This software is governed by the CeCILL-C license under French law and + abiding by the rules of distribution of free software. You can use, + modify and/ or redistribute the software under the terms of the CeCILL-C + license as circulated by CEA, CNRS and INRIA at the following URL + "http://www.cecill.info". + + As a counterpart to the access to the source code and rights to copy, + modify and redistribute granted by the license, users are provided only + with a limited warranty and the software's author, the holder of the + economic rights, and the successive licensors have only limited + liability. + + The fact that you are presently reading this means that you have had + knowledge of the CeCILL-C license and that you accept its terms. +*/ + +#ifndef __N2D2_EXPORT_CPP_CUSTOMFC_HPP__ +#define __N2D2_EXPORT_CPP_CUSTOMFC_HPP__ + +#include <cmath> + +#include "kernels/typedefs.hpp" +#include "assert.h" +#include "utils.hpp" +#include "kernels/Macs.hpp" +#include "kernels/subkernels_functions.hpp" + +namespace N2D2_Export { +/** @brief Packed low-bit fully connected layer. For each output channel och the sum starts at biasses[och], macsOnRange<NB_CHANNELS>() accumulates every input position (iy, ix), then sat<Output_T>(sum, och, ACTIVATION, rescaling) produces the value that compact_data_during_loop() writes into outputs; compact_data_end_loop() is called once after the last channel. OUTPUTS_HEIGHT and OUTPUTS_WIDTH must both be 1 (static_assert). NOTE(review): the sibling kernels in this patch carry a custom prefix (customconvcellPropagate, custompoolcellPropagate) while this one is named fccellPropagate - check for a clash with an existing kernel of that name. */ +template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH, + int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, + ActivationFunction_T ACTIVATION, + typename Sum_T, typename Input_T, typename Output_T, + typename Weight_T, typename Bias_T, typename Rescaling_T> +__attribute__((always_inline)) inline static +void fccellPropagate(const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Bias_T* __restrict biasses, + const Weight_T* __restrict weights, + const Rescaling_T& __restrict rescaling) +{ + static_assert(OUTPUTS_HEIGHT == 1, "Outputs height should be 1"); + static_assert(OUTPUTS_WIDTH == 1, "Outputs width should be 1"); + + PackSupport infoPack = {0, 0}; + /* Per-position strides for inputs and weights. NOTE(review): the raw numeric_limits digits are used here, whereas customconvcellPropagate first normalizes the width (bits_norm_in / bits_norm_wt) - confirm the difference is intended. */ + constexpr int INPUTS_BYTE + = std::ceil(((NB_CHANNELS * std::numeric_limits<Input_T>::digits) + + (NB_CHANNELS * std::numeric_limits<Input_T>::digits) % 8) / (float)8); + constexpr int WEIGHTS_BYTE + = 
std::ceil(((NB_CHANNELS * std::numeric_limits<Weight_T>::digits) + + (NB_CHANNELS * std::numeric_limits<Weight_T>::digits) % 8) / (float)8); + + int outputOffset = 0; + for (int och = 0; och < NB_OUTPUTS; ++och) { + Sum_T weightedSum = biasses[och]; + + for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) { + + for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) { + + const int weightsOffset = CHANNELS_HEIGHT * CHANNELS_WIDTH * WEIGHTS_BYTE * och + + (CHANNELS_WIDTH * iy + ix) * WEIGHTS_BYTE; + const int inputsOffset = (CHANNELS_WIDTH * iy + ix) * INPUTS_BYTE; + + macsOnRange<NB_CHANNELS>(inputs + inputsOffset, + weights + weightsOffset, + weightedSum); + } + } + Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling); + compact_data_during_loop(output, outputs, outputOffset, infoPack); + } + compact_data_end_loop(outputs, outputOffset, infoPack); +} + +} // N2D2_Export + +#endif // __N2D2_EXPORT_CPP_CUSTOMFC_HPP__ diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp new file mode 100644 index 0000000..bcc6a09 --- /dev/null +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/CustomPooling.hpp @@ -0,0 +1,114 @@ +/* + (C) Copyright 2017 CEA LIST. All Rights Reserved. + Contributor(s): N2D2 Team + + This software is governed by the CeCILL-C license under French law and + abiding by the rules of distribution of free software. You can use, + modify and/ or redistribute the software under the terms of the CeCILL-C + license as circulated by CEA, CNRS and INRIA at the following URL + "http://www.cecill.info". + + As a counterpart to the access to the source code and rights to copy, + modify and redistribute granted by the license, users are provided only + with a limited warranty and the software's author, the holder of the + economic rights, and the successive licensors have only limited + liability. 
+ + The fact that you are presently reading this means that you have had + knowledge of the CeCILL-C license and that you accept its terms. +*/ + +#ifndef __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__ +#define __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__ + +#include <cmath> + +#include "kernels/typedefs.hpp" +#include "assert.h" +#include "utils.hpp" +#include "kernels/Macs.hpp" +#include "kernels/subkernels_functions.hpp" + + +namespace N2D2_Export { +/** @brief Max pooling over packed low-bit data. For every output position (oy, ox), och_c steps from 0 to OUTPUTS_BYTE in groups of nb_data; each group is reduced over the in-bounds kernel window with parallelMaxPooling() and written with storeMaxPooling(). Only POOLING == Max with a Linear activation, identical Input_T / Output_T and NB_CHANNELS == NB_OUTPUTS is accepted (static_assert). @param inputs input data, addressed with a per-pixel stride of INPUTS_BYTE. @param outputs destination passed to storeMaxPooling(). */ +template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH, + int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, + int PADDING_Y, int PADDING_X, + int STRIDE_Y, int STRIDE_X, + int KERNEL_HEIGHT, int KERNEL_WIDTH, + Pooling_T POOLING, ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline static +void custompoolcellPropagate(const Input_T* __restrict inputs, + Output_T* __restrict outputs) +{ + static_assert(std::is_same<Input_T, Output_T>::value, "Input_T and Output_T must be the same."); + static_assert(NB_CHANNELS == NB_OUTPUTS, "nb_channels should be equal to nb_outputs."); + static_assert(POOLING == Max , "Only supports Max pooling."); + static_assert(ACTIVATION == Linear, "Only supports a Linear activation."); + + PackSupport infoPack = {0, 0}; + /* NOTE(review): infoPack is not referenced again in this function. The per-pixel strides below are derived from the raw numeric_limits digits. */ + constexpr int INPUTS_BYTE + = std::ceil(((NB_CHANNELS * std::numeric_limits<Input_T>::digits) + + (NB_CHANNELS * std::numeric_limits<Input_T>::digits) % 8) / (float)8); + constexpr int OUTPUTS_BYTE + = std::ceil(((NB_OUTPUTS * std::numeric_limits<Output_T>::digits) + + (NB_OUTPUTS * std::numeric_limits<Output_T>::digits) % 8) / (float)8); + + int outputOffset = 0; + + int iy = 0; + for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0); + const int syMax = (PADDING_Y == 0) ? 
KERNEL_HEIGHT + : clamp(CHANNELS_HEIGHT + PADDING_Y - iy, + 0, KERNEL_HEIGHT); + + int ix = 0; + for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0); + const int sxMax = (PADDING_X == 0) ? KERNEL_WIDTH + : clamp(CHANNELS_WIDTH + PADDING_X - ix, + 0, KERNEL_WIDTH); + + int och_c = 0; + while (och_c < OUTPUTS_BYTE) { + /* maxVal is the running maximum for the current group, reset to the lowest value of its type. NOTE(review): the signed type is chosen only when numeric_limits<Input_T>::digits == 32, while the built-in int32_t reports 31 digits - confirm the intended Input_T. */ + // typename std::conditional<(!std::is_unsigned<Input_T>::value && + // std::numeric_limits<Input_T>::digits == 32), data<32>, udata<32>>::type maxVal; + // maxVal = decltype(maxVal)::lowest(); + typename std::conditional<(!std::is_unsigned<Input_T>::value && + std::numeric_limits<Input_T>::digits == 32), int32_t, uint32_t>::type maxVal; + maxVal = std::numeric_limits<decltype(maxVal)>::lowest(); + + int nb_data = min(OUTPUTS_BYTE-och_c, get_pool_nbData(std::numeric_limits<Input_T>::digits)); + + for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { + + if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) { + continue; + } + const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE + + (ix - PADDING_X) * INPUTS_BYTE + och_c; + + for (int sx = 0; sx < KERNEL_WIDTH; ++sx) { + if(sx < sxMin || sx >= sxMax) { + continue; + } + parallelMaxPooling(inputs + inputsOffset + sx*INPUTS_BYTE, maxVal, nb_data); + } + } + storeMaxPooling(outputs, outputOffset, maxVal, nb_data); + och_c += nb_data; + } + + ix += STRIDE_X; + } + iy += STRIDE_Y; + } +} + +} +#endif \ No newline at end of file diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp index 5aa5183..a0a1f85 100644 --- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp @@ -1,211 +1,3262 @@ -/* - (C) Copyright 2017 CEA LIST. All Rights Reserved. - Contributor(s): N2D2 Team +// /* +// (C) Copyright 2017 CEA LIST. All Rights Reserved. 
+// Contributor(s): N2D2 Team - This software is governed by the CeCILL-C license under French law and - abiding by the rules of distribution of free software. You can use, - modify and/ or redistribute the software under the terms of the CeCILL-C - license as circulated by CEA, CNRS and INRIA at the following URL - "http://www.cecill.info". +// This software is governed by the CeCILL-C license under French law and +// abiding by the rules of distribution of free software. You can use, +// modify and/ or redistribute the software under the terms of the CeCILL-C +// license as circulated by CEA, CNRS and INRIA at the following URL +// "http://www.cecill.info". - As a counterpart to the access to the source code and rights to copy, - modify and redistribute granted by the license, users are provided only - with a limited warranty and the software's author, the holder of the - economic rights, and the successive licensors have only limited - liability. +// As a counterpart to the access to the source code and rights to copy, +// modify and redistribute granted by the license, users are provided only +// with a limited warranty and the software's author, the holder of the +// economic rights, and the successive licensors have only limited +// liability. - The fact that you are presently reading this means that you have had - knowledge of the CeCILL-C license and that you accept its terms. -*/ +// The fact that you are presently reading this means that you have had +// knowledge of the CeCILL-C license and that you accept its terms. 
+// */ -#ifndef __N2D2_EXPORT_CPP_MACS_HPP__ -#define __N2D2_EXPORT_CPP_MACS_HPP__ +// #ifndef __N2D2_EXPORT_CPP_MACS_HPP__ +// #define __N2D2_EXPORT_CPP_MACS_HPP__ -#include <cstdint> -#include <limits> -#include <type_traits> -#include <cmsis_compiler.h> +// #include <cstdint> +// #include <limits> +// #include <type_traits> +// #include <cmsis_compiler.h> -namespace N2D2_Export { +// #include "swar_arm_acle.h" +// namespace N2D2_Export { -template<typename Input_T> -inline static -uint32_t XTB16(uint32_t val) + +// template<typename Input_T> +// inline static +// uint32_t XTB16(uint32_t val) +// { +// return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val); +// } + +// template<int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// typename Input_T, +// typename Weight_T, +// typename Sum_T> +// inline static +// Sum_T dualMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// weightedSum += inputs[0] * weights[0] +// + inputs[INPUTS_INC] * weights[WEIGHTS_INC]; + +// return weightedSum; +// } + +// template<int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// typename Input_T, +// typename Weight_T, +// typename Sum_T, +// typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr> +// inline static +// Sum_T quadMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC] +// + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC] +// + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC] +// + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC]; + +// return weightedSum; +// } + +// template<int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// typename Input_T, +// typename Weight_T, +// typename Sum_T, +// typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr> +// inline static +// Sum_T quadMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict 
weights, +// Sum_T weightedSum) +// { +// if(INPUTS_INC != 1 || WEIGHTS_INC != 1) { +// weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC] +// + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC] +// + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC] +// + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC]; + +// return weightedSum; +// } + +// // Inputs loading & preparation +// uint32_t in; +// memcpy((void*) &in, inputs, sizeof(in)); + +// uint32_t in1 = XTB16<Input_T>(in); +// uint32_t in2 = XTB16<Input_T>(in >> 8); + +// // Weights loading & preparation +// uint32_t wt; +// memcpy((void*) &wt, weights, sizeof(wt)); + +// uint32_t wt1 = XTB16<Weight_T>(wt); +// uint32_t wt2 = XTB16<Weight_T>(wt >> 8); + +// // Computation +// if(std::is_same<Sum_T, int32_t>::value) { +// weightedSum = __SMLAD(in1, wt1, weightedSum); +// weightedSum = __SMLAD(in2, wt2, weightedSum); +// } +// else { +// weightedSum = __SMLALD(in1, wt1, weightedSum); +// weightedSum = __SMLALD(in2, wt2, weightedSum); + +// } + +// return weightedSum; +// } + + + + +// // ---------------------------------------------------------------------------- +// // -------------- MAC computing functions for kernel 4W-4A -------------------- +// // ---------------------------------------------------------------------------- + +// /** +// * @brief Unsigned mono mac operation (4W/4A version) +// * @details Performs one mac operation for signed 4-bits weights +// * and unsigned 4-bits inputs. 
+// * +// * @tparam Input_T Input type (should be udata<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to input vector +// * @param[in] weights Pointer to kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the mono mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T monoMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// weightedSum += __UBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4); +// return weightedSum; +// } + +// /** +// * @brief Signed mono mac operation (4W/4A version) +// * @details Performs one mac operation for signed 4-bits weights +// * and signed 4-bits inputs. 
+// * +// * @tparam Input_T Input type (should be data<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to input vector +// * @param[in] weights Pointer to kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the mono mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(!std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T monoMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// weightedSum += __SBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4); +// return weightedSum; +// } + +// /** +// * @brief Unsigned dual mac operation (4W/4A version) +// * @details Performs two mac operations for signed 4-bits weights +// * and unsigned 4-bits inputs. Extracts the two 4-bits weights +// * from a stored 8-bits weight and associates them into +// * a 32-bits value. Then extracts the two 4-bits inputs +// * from a stored 8-bits input and associates them into +// * a 32-bits value. 
Finally performs a dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be udata<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the dual mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T dualMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint8_t wt; +// std::memcpy((void*) &wt, weights, sizeof(wt)); + +// int32_t w0 = __SBFX(wt, 0, 4); +// int32_t w1 = __SBFX(wt, 4, 4); +// uint32_t wght = __BFI(w1, w0, 16, 16); + +// uint8_t in; +// std::memcpy((void*) &in, inputs, sizeof(in)); + +// int32_t a0 = __UBFX(in, 0, 4); +// int32_t a1 = __UBFX(in, 4, 4); +// uint32_t act = __BFI(a1, a0, 16, 16); + +// weightedSum = __SMLAD(act, wght, weightedSum); + +// return weightedSum; +// } + +// /** +// * @brief Signed dual mac operation (4W/4A version) +// * @details Performs two mac operations for signed 4-bits weights +// * and signed 4-bits inputs. Extracts the two 4-bits weights +// * from a stored 8-bits weight and associates them into +// * a 32-bits value. Then extracts the two 4-bits inputs +// * from a stored 8-bits input and associates them into +// * a 32-bits value. 
Finally performs a dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be data<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the dual mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(!std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T dualMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint8_t wt; +// std::memcpy((void*) &wt, weights, sizeof(wt)); + +// int32_t w0 = __SBFX(wt, 0, 4); +// int32_t w1 = __SBFX(wt, 4, 4); +// uint32_t wght = __BFI(w1, w0, 16, 16); + +// uint8_t in; +// std::memcpy((void*) &in, inputs, sizeof(in)); + +// int32_t a0 = __SBFX(in, 0, 4); +// int32_t a1 = __SBFX(in, 4, 4); +// uint32_t act = __BFI(a1, a0, 16, 16); + +// weightedSum = __SMLAD(act, wght, weightedSum); + +// return weightedSum; +// } + +// /** +// * @brief Unsigned quad mac operation (4W/4A version) +// * @details Performs four mac operations for signed 4-bits weights +// * and unsigned 4-bits inputs. Extracts the four 4-bits weights +// * from two stored 8-bits weights and associates them into +// * two 32-bits values. Then extracts the four 4-bits inputs +// * from two stored 8-bits inputs and associates them into +// * two 32-bits values. 
Finally performs a double dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be udata<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the quad mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T quadMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint16_t wt; +// std::memcpy((void*) &wt, weights, sizeof(wt)); + +// int32_t w0 = __SBFX(wt, 0, 4); +// int32_t w1 = __SBFX(wt, 4, 4); +// int32_t w2 = __SBFX(wt, 8, 4); +// int32_t w3 = __SBFX(wt, 12, 4); + +// uint32_t evenW1 = __BFI(w2, w0, 16, 16); +// uint32_t oddW1 = __BFI(w3, w1, 16, 16); + +// uint16_t in; +// std::memcpy((void*) &in, inputs, sizeof(in)); + +// int32_t a0 = __UBFX(in, 0, 4); +// int32_t a1 = __UBFX(in, 4, 4); +// int32_t a2 = __UBFX(in, 8, 4); +// int32_t a3 = __UBFX(in, 12, 4); + +// uint32_t evenA1 = __BFI(a2, a0, 16, 16); +// uint32_t oddA1 = __BFI(a3, a1, 16, 16); + +// weightedSum = __SMLAD(evenA1, evenW1, weightedSum); +// weightedSum = __SMLAD(oddA1, oddW1, weightedSum); + +// return weightedSum; +// } + +// /** +// * @brief Signed quad mac operation (4W/4A version) +// * @details Performs four mac operations for signed 4-bits weights +// * and signed 4-bits inputs. Extracts the four 4-bits weights +// * from two stored 8-bits weights and associates them into +// * two 32-bits values. 
Then extracts the four 4-bits inputs +// * from two stored 8-bits inputs and associates them into +// * two 32-bits values. Finally performs a double dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be data<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the quad mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(!std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T quadMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint16_t wt; +// std::memcpy((void*) &wt, weights, sizeof(wt)); + +// int32_t w0 = __SBFX(wt, 0, 4); +// int32_t w1 = __SBFX(wt, 4, 4); +// int32_t w2 = __SBFX(wt, 8, 4); +// int32_t w3 = __SBFX(wt, 12, 4); + +// uint32_t evenW1 = __PKHBT(w2, w0, 16); +// uint32_t oddW1 = __PKHBT(w3, w1, 16); + +// uint16_t in; +// std::memcpy((void*) &in, inputs, sizeof(in)); + +// int32_t a0 = __SBFX(in, 0, 4); +// int32_t a1 = __SBFX(in, 4, 4); +// int32_t a2 = __SBFX(in, 8, 4); +// int32_t a3 = __SBFX(in, 12, 4); + +// uint32_t evenA1 = __PKHBT(a2, a0, 16); +// uint32_t oddA1 = __PKHBT(a3, a1, 16); + +// weightedSum = __SMLAD(evenA1, evenW1, weightedSum); +// weightedSum = __SMLAD(oddA1, oddW1, weightedSum); + +// return weightedSum; +// } + +// /** +// * @brief Unsigned octo mac operation (4W/4A version) +// * @details Performs eight mac operations for signed 4-bits weights +// * and unsigned 4-bits inputs. 
Extracts the eight 4-bits weights +// * from four stored 8-bits weights and associates them into +// * four 32-bits values. Then extracts the eight 4-bits inputs +// * from four stored 8-bits inputs and associates them into +// * four 32-bits values. Finally performs a quadruple dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be udata<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the octo mac operation +// */ +// // template<typename Input_T, typename Weight_T, +// // typename std::enable_if<(std::is_unsigned<Input_T>::value +// // && std::numeric_limits<Weight_T>::digits == 4 +// // && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// // __attribute__((always_inline)) static inline +// // Sum_T octoMac(const Input_T* __restrict inputs, +// // const Weight_T* __restrict weights, +// // Sum_T weightedSum) +// // { +// // uint32_t wt; +// // std::memcpy((void*) &wt, weights, sizeof(wt)); + +// // int32_t w0 = __SBFX(wt, 0, 4); +// // int32_t w1 = __SBFX(wt, 4, 4); +// // int32_t w2 = __SBFX(wt, 8, 4); +// // int32_t w3 = __SBFX(wt, 12, 4); +// // int32_t w4 = __SBFX(wt, 16, 4); +// // int32_t w5 = __SBFX(wt, 20, 4); +// // int32_t w6 = __SBFX(wt, 24, 4); +// // int32_t w7 = __SBFX(wt, 28, 4); + +// // // uint32_t weight0 = __BFI(w4, w0, 16, 16); +// // // uint32_t weight1 = __BFI(w5, w1, 16, 16); +// // // uint32_t weight2 = __BFI(w6, w2, 16, 16); +// // // uint32_t weight3 = __BFI(w7, w3, 16, 16); + +// // uint32_t weight0 = __PKHBT(w0, w4, 16); +// // uint32_t weight1 = __PKHBT(w1, w5, 16); +// // uint32_t weight2 = __PKHBT(w2, w6, 16); +// // uint32_t weight3 = __PKHBT(w3, w7, 16); + +// // uint32_t in; +// // 
std::memcpy((void*) &in, inputs, sizeof(in)); + +// // uint32_t act0 = in & 0xF000F; +// // uint32_t act1 = (in >> 4) & 0xF000F; +// // uint32_t act2 = (in >> 8) & 0xF000F; +// // uint32_t act3 = (in >> 12) & 0xF000F; + +// // weightedSum = __SMLAD(act0, weight0, weightedSum); +// // weightedSum = __SMLAD(act1, weight1, weightedSum); +// // weightedSum = __SMLAD(act2, weight2, weightedSum); +// // weightedSum = __SMLAD(act3, weight3, weightedSum); + +// // return weightedSum; +// // } + +// // template<typename Input_T, typename Weight_T, +// // typename std::enable_if<(std::is_unsigned<Input_T>::value +// // && std::numeric_limits<Weight_T>::digits == 4 +// // && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// // __attribute__((always_inline)) static inline +// // Sum_T octoMac(const Input_T* __restrict inputs, +// // const Weight_T* __restrict weights, +// // Sum_T weightedSum) +// // { +// // union n2d2_dataword wt; +// // std::memcpy((void*) &wt, weights, sizeof(wt)); + +// // union n2d2_udataword in; +// // std::memcpy((void*) &in, inputs, sizeof(in)); + +// // for (int i = 0; i < 4; ++i) { +// // weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0; +// // weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1; +// // } + +// // // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0; +// // // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1; +// // // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0; +// // // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1; +// // // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0; +// // // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1; +// // // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0; 
+// // // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1; + +// // return weightedSum; +// // } + +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T octoMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint32_t wt; +// memcpy((void*) &wt, weights, sizeof(wt)); + +// // Works with weights * 4096 (weights << 12) +// const uint32_t WeightMask = 0xF000F000; +// uint32_t weight0 = WeightMask & (wt << 12); +// uint32_t weight1 = WeightMask & (wt << 8); +// uint32_t weight2 = WeightMask & (wt << 4); +// uint32_t weight3 = WeightMask & (wt); + +// uint32_t in; +// memcpy((void*) &in, inputs, sizeof(in)); + +// const uint32_t ActMask = 0x000F000F; // to explicit instructions +// uint32_t act0 = in & ActMask; +// // Expect second operand shift +// uint32_t act1 = ActMask & (in >> 4); +// uint32_t act2 = ActMask & (in >> 8); +// uint32_t act3 = ActMask & (in >> 12); + +// Sum_T sum = 0; +// sum = __SMLAD(act0, weight0, sum); +// sum = __SMLAD(act1, weight1, sum); +// sum = __SMLAD(act2, weight2, sum); +// sum = __SMLAD(act3, weight3, sum); + +// return weightedSum + (sum >> 12); +// } + +// /** +// * @brief Signed octo mac operation (4W/4A version) +// * @details Performs eight mac operations for signed 4-bits weights +// * and signed 4-bits inputs. Extracts the eight 4-bits weights +// * from four stored 8-bits weights and associates them into +// * four 32-bits values. Then extracts the eight 4-bits inputs +// * from four stored 8-bits inputs and associates them into +// * four 32-bits values. 
Finally performs a quadruple dual mac operation +// * with the __SMLAD instruction +// * +// * @tparam Input_T Input type (should be data<4>) +// * @tparam Weight_T Weight type (should be data<4>) +// * +// * @param[in] inputs Pointer to compressed input vector +// * @param[in] weights Pointer to compressed kernel weights +// * @param[in,out] weightedSum Accumulating sum from the +// * previous mac operations +// * @returns Updated weightedSum with +// * the result of the octo mac operation +// */ +// template<typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(!std::is_unsigned<Input_T>::value +// && std::numeric_limits<Weight_T>::digits == 4 +// && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// Sum_T octoMac(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T weightedSum) +// { +// uint32_t wt; +// std::memcpy((void*) &wt, weights, sizeof(wt)); + +// int32_t w0 = __SBFX(wt, 0, 4); +// int32_t w1 = __SBFX(wt, 4, 4); +// int32_t w2 = __SBFX(wt, 8, 4); +// int32_t w3 = __SBFX(wt, 12, 4); +// int32_t w4 = __SBFX(wt, 16, 4); +// int32_t w5 = __SBFX(wt, 20, 4); +// int32_t w6 = __SBFX(wt, 24, 4); +// int32_t w7 = __SBFX(wt, 28, 4); + +// uint32_t evenW1 = __PKHBT(w2, w0, 16); +// uint32_t oddW1 = __PKHBT(w3, w1, 16); +// uint32_t evenW2 = __PKHBT(w6, w4, 16); +// uint32_t oddW2 = __PKHBT(w7, w5, 16); + +// uint32_t in; +// std::memcpy((void*) &in, inputs, sizeof(in)); + +// int32_t a0 = __SBFX(in, 0, 4); +// int32_t a1 = __SBFX(in, 4, 4); +// int32_t a2 = __SBFX(in, 8, 4); +// int32_t a3 = __SBFX(in, 12, 4); +// int32_t a4 = __SBFX(in, 16, 4); +// int32_t a5 = __SBFX(in, 20, 4); +// int32_t a6 = __SBFX(in, 24, 4); +// int32_t a7 = __SBFX(in, 28, 4); + +// uint32_t evenA1 = __PKHBT(a2, a0, 16); +// uint32_t oddA1 = __PKHBT(a3, a1, 16); +// uint32_t evenA2 = __PKHBT(a6, a4, 16); +// uint32_t oddA2 = __PKHBT(a7, a5, 16); + +// weightedSum = 
__SMLAD(evenA1, evenW1, weightedSum); +// weightedSum = __SMLAD(oddA1, oddW1, weightedSum); +// weightedSum = __SMLAD(evenA2, evenW2, weightedSum); +// weightedSum = __SMLAD(oddA2, oddW2, weightedSum); + +// return weightedSum; +// } + + +// // template<typename Input_T, typename Weight_T, typename Sum_T, +// // typename std::enable_if<(std::is_unsigned<Input_T>::value +// // && std::numeric_limits<Weight_T>::digits == 4 +// // && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// // void macsOnParallel(const Input_T* __restrict inputs, +// // const Weight_T* __restrict weights, +// // Sum_T* weightedSums, +// // const int nb_data) +// // { +// // uint32_t wt = 0; +// // std::memcpy((void*) &wt, weights, ceil((double)nb_data/2)); + +// // uint32_t in = 0; +// // std::memcpy((void*) &in, inputs, ceil((double)nb_data/2)); + +// // for (int i = 0; i < nb_data; ++i) { +// // weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4); +// // } +// // } + +// // template<typename Input_T, typename Weight_T, typename Sum_T, +// // typename std::enable_if<(!std::is_unsigned<Input_T>::value +// // && std::numeric_limits<Weight_T>::digits == 4 +// // && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +// // void macsOnParallel(const Input_T* __restrict inputs, +// // const Weight_T* __restrict weights, +// // Sum_T* weightedSums, +// // const int nb_data) +// // { +// // uint32_t wt = 0; +// // std::memcpy((void*) &wt, weights, ceil((double)nb_data/2)); + +// // uint32_t in = 0; +// // std::memcpy((void*) &in, inputs, ceil((double)nb_data/2)); + +// // for (int i = 0; i < nb_data; ++i) { +// // weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4); +// // } +// // } + + + + +// // ************************************************************************** +// // * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS * +// // ************************************************************************** + +// template<int 
NB_ITERATIONS, +// int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// class Input_T, +// class Weight_T, +// class Sum_T, +// typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr> +// inline static +// void macsOnRange(const Input_T* __restrict /*inputs*/, +// const Weight_T* __restrict /*weights*/, +// Sum_T& __restrict /*weightedSum*/) +// { +// // Nothing to do +// } + +// template<int NB_ITERATIONS, +// int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// class Input_T, +// class Weight_T, +// class Sum_T, +// typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr> +// inline static +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& __restrict weightedSum) +// { +// weightedSum += (*weights) * (*inputs); +// } + +// template<int NB_ITERATIONS, +// int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// class Input_T, +// class Weight_T, +// class Sum_T, +// typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr> +// inline static +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& __restrict weightedSum) +// { +// weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC, +// weights + 2*WEIGHTS_INC, +// weightedSum); +// } + +// /** +// * @brief MACs Processing +// * @details Performs NB_ITERATIONS MACs operations, storing results into the +// * weightedSum variable. 
+// * +// * @tparam NB_ITERATIONS Number of MACs to perform +// * @tparam INPUTS_INC Input Stride +// * @tparam WEIGHTS_INC Weights Stride +// * @tparam Input_T Input Type +// * +// * @param inputs Pointer to inputs vector +// * @param weights Pointer to weights vector +// * @param weightedSum Pointer to weightedSum +// */ +// template<int NB_ITERATIONS, +// int INPUTS_INC = 1, +// int WEIGHTS_INC = 1, +// class Input_T, +// class Weight_T, +// class Sum_T, +// typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr> +// inline static +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& __restrict weightedSum) +// { +// weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC, +// weights + 4*WEIGHTS_INC, +// weightedSum); +// } + + +// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& weightedSum) +// { +// constexpr unsigned int idxI +// = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1; +// constexpr unsigned int idxW +// = (std::numeric_limits<Weight_T>::digits > 4) ? 
2 : 1; + +// weightedSum = dualMac(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum); +// } + +// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<NB_ITERATIONS >= 4 +// && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& weightedSum) +// { +// constexpr unsigned int idxI +// = (std::numeric_limits<Input_T>::digits > 4) +// ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1; + +// constexpr unsigned int idxW = 4; + +// weightedSum = quadMac(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum); +// } + +// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) +// && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& weightedSum) +// { +// constexpr unsigned int idxI +// = (std::numeric_limits<Input_T>::digits > 4) +// ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 
2 : 1; + +// constexpr unsigned int idxW = 2; + +// weightedSum = quadMac(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum); +// } + +// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, +// typename std::enable_if<NB_ITERATIONS >= 8 +// && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr> +// __attribute__((always_inline)) static inline +// void macsOnRange(const Input_T* __restrict inputs, +// const Weight_T* __restrict weights, +// Sum_T& weightedSum) +// { +// constexpr unsigned int idxI +// = (std::numeric_limits<Input_T>::digits > 4) +// ? 8 : (std::numeric_limits<Input_T>::digits == 4) +// ? 4 : (std::numeric_limits<Input_T>::digits == 2) +// ? 2 : 1; + +// constexpr unsigned int idxW = 4; + +// weightedSum = octoMac(inputs, weights, weightedSum); +// macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum); +// } + + +// } // N2D2_Export + +// #endif // __N2D2_EXPORT_CPP_MACS_HPP__ + + + + +/** + ****************************************************************************** + * @file mac_functions.hpp + * @brief Mac operation functions for ARM Cortex m7 and m4 + * This file provides different functions to perform + * signed and unsigned mac operations. Those functions can calculate + * up to eight mac operations at once. + * The file also provides two general mac operation which can be + * used in other files, especially in Network.hpp + * + ****************************************************************************** + * @attention + * + * (C) Copyright 2021 CEA LIST. All Rights Reserved. + * Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr) + * Philippe DORE (philippe.dore@cea.fr) + * David BRIAND (david.briand@cea.fr) + * + * This file is not part of the open source version of N2D2 and is NOT under + * the CeCILL-C license. This code is the property of the CEA. 
It can not be + * copied or disseminated without its authorization. + * + ****************************************************************************** + */ + +#ifndef __N2D2_MAC_FUNCTIONS_HPP__ +#define __N2D2_MAC_FUNCTIONS_HPP__ + +#include <cstring> +#include "swar_arm_acle.h" +#include "kernels/typedefs.hpp" + + +// ---------------------------------------------------------------------------- +// --------------- MAC computing functions for all kernels -------------------- +// ---------------------------------------------------------------------------- + + +// ---------------------------------------------------------------------------- +// -------------- MAC computing functions for kernel 8W-8A -------------------- +// ---------------------------------------------------------------------------- + +/** + * @brief Mono mac operation (8W/8A version) + * @details Performs one mac operation for signed 8-bits weights + * and 8-bits inputs (signed or not). + * + * @tparam Input_T Input type (udata<8> or data<8>) + * @tparam Weight_T Weight type (should be data<8>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the dual mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T monoMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + weightedSum += (Sum_T)inputs[0] * weights[0]; + return weightedSum; +} + +/** + * @brief Dual mac operation (8W/8A version) + * @details Performs two mac operations for signed 8-bits weights + * and 8-bits inputs (signed or not). 
+ * + * @tparam Input_T Input type (udata<8> or data<8>) + * @tparam Weight_T Weight type (should be data<8>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the dual mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T dualMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + weightedSum += (Sum_T)inputs[0] * weights[0] + (Sum_T)inputs[1] * weights[1]; + return weightedSum; +} + +/** + * @brief Unsigned quad mac operation (8W/8A version) + * @details Performs four mac operations for signed 8-bits weights + * and unsigned 8-bits inputs. Sign extends four 8-bits weights + * and associates them into two 32-bits values. Then zero extends + * four 8-bits inputs and associates them into two 32-bits values. + * Finally performs a double dual mac operation + * with the __SMLAD instruction. 
+ * + * @tparam Input_T Input type (should be udata<8>) + * @tparam Weight_T Weight type (should be data<8>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the quad mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 8 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t in1 = __UXTB16(in); + uint32_t in2 = __UXTB16_RORn(in, 8); + + uint32_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + uint32_t wt1 = __SXTB16(wt); + uint32_t wt2 = __SXTB16_RORn(wt, 8); + + weightedSum = __SMLAD(in1, wt1, weightedSum); + weightedSum = __SMLAD(in2, wt2, weightedSum); + + return weightedSum; +} + +/** + * @brief Signed quad mac operation (8W/8A version) + * @details Performs four mac operations for signed 8-bits weights + * and signed 8-bits inputs. Sign extends four 8-bits weights + * and associates them into two 32-bits values. Then sign extends + * four 8-bits inputs and associates them into two 32-bits values. + * Finally performs a double dual mac operation + * with the __SMLAD instruction. 
+ * + * @tparam Input_T Input type (should be data<8>) + * @tparam Weight_T Weight type (should be data<8>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the quad mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 8 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t in1 = __SXTB16(in); + uint32_t in2 = __SXTB16_RORn(in, 8); + + uint32_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + uint32_t wt1 = __SXTB16(wt); + uint32_t wt2 = __SXTB16_RORn(wt, 8); + + weightedSum = __SMLAD(in1, wt1, weightedSum); + weightedSum = __SMLAD(in2, wt2, weightedSum); + + return weightedSum; +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +void macsOnParallel(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T* weightedSums, + const int nb_data) +{ + union n2d2_dataword wt = {0}; + std::memcpy((void*) &wt, weights, nb_data); + + typename std::conditional<(!std::is_unsigned<Input_T>::value), + union n2d2_dataword, union n2d2_udataword>::type in = {0}; + std::memcpy((void*) &in, inputs, nb_data); + + for (int i = 0; i < nb_data; ++i) { + weightedSums[i] += (Sum_T)wt.bytes[i] * in.bytes[i]; + } +} + + + +// ---------------------------------------------------------------------------- +// -------------- MAC computing functions for 
kernel 4W-8A -------------------- +// ---------------------------------------------------------------------------- + +/** + * @brief Mono mac operation (4W/8A version) + * @details Performs one mac operation for signed 4-bits weights + * and 8-bits inputs (signed or not). + * + * @tparam Input_T Input type (udata<8> or data<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the dual mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T monoMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + weightedSum += (Sum_T)inputs[0] * __SBFX(weights[0], 4, 4); + return weightedSum; +} + +/** + * @brief Unsigned dual mac operation (4W/8A version) + * @details Performs two mac operations for signed 4-bits weights + * and unsigned 8-bits inputs. Extracts the two 4-bits weights + * from a stored 8-bits weight and associates them into + * a 32-bits value. Then zero extends two 8-bits inputs and + * associates them into a 32-bits value. 
Finally performs a + * dual mac operation with the __SMLAD instruction + * + * @tparam Input_T Input type (should be udata<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to compressed kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the dual mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T dualMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint8_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + int32_t w0 = __SBFX(wt, 0, 4); + int32_t w1 = __SBFX(wt, 4, 4); + uint32_t wght = __BFI(w0, w1, 16, 16); + + uint16_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t act = ((in << 8) | in); + act = __UXTB16(act); + + weightedSum = __SMLAD(act, wght, weightedSum); + + return weightedSum; +} + +/** + * @brief Signed dual mac operation (4W/8A version) + * @details Performs two mac operations for signed 4-bits weights + * and signed 8-bits inputs. Extracts the two 4-bits weights + * from a stored 8-bits weight and associates them into + * a 32-bits value. Then sign extends two 8-bits inputs and + * associates them into a 32-bits value. 
Finally performs a + * dual mac operation with the __SMLAD instruction + * + * @tparam Input_T Input type (should be data<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to compressed kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the dual mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T dualMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint8_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + int32_t w0 = __SBFX(wt, 0, 4); + int32_t w1 = __SBFX(wt, 4, 4); + uint32_t wght = __BFI(w0, w1, 16, 16); + + uint16_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t act = ((in << 8) | in); + act = __SXTB16(act); + + weightedSum = __SMLAD(act, wght, weightedSum); + + return weightedSum; +} + +/** + * @brief Unsigned quad mac operation (4W/8A version) + * @details Performs four mac operations for signed 4-bits weights + * and unsigned 8-bits inputs. Extracts the four 4-bits weights + * from two stored 8-bits weights and associates them into + * two 32-bits values. Then zero extends four 8-bits inputs and + * associates them into two 32-bits values. 
Finally performs a + * double dual mac operation with the __SMLAD instruction + * + * @tparam Input_T Input type (should be udata<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to compressed kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the quad mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint16_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + int32_t w0 = __SBFX(wt, 0, 4); + int32_t w1 = __SBFX(wt, 4, 4); + int32_t w2 = __SBFX(wt, 8, 4); + int32_t w3 = __SBFX(wt, 12, 4); + + uint32_t evenW1 = __PKHBT(w0, w2, 16); + uint32_t oddW1 = __PKHBT(w1, w3, 16); + + uint32_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t evenA1 = __UXTB16(in); + uint32_t oddA1 = __UXTB16_RORn(in, 8); + + weightedSum = __SMLAD(evenA1, oddW1, weightedSum); + weightedSum = __SMLAD(oddA1, evenW1, weightedSum); + + return weightedSum; +} + +/** + * @brief Signed quad mac operation (4W/8A version) + * @details Performs four mac operations for signed 4-bits weights + * and signed 8-bits inputs. Extracts the four 4-bits weights + * from two stored 8-bits weights and associates them into + * two 32-bits values. Then sign extends four 8-bits inputs and + * associates them into two 32-bits values. 
Finally performs a + * double dual mac operation with the __SMLAD instruction + * + * @tparam Input_T Input type (should be data<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to compressed kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the quad mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint16_t wt; + std::memcpy((void*) &wt, weights, sizeof(wt)); + + int32_t w0 = __SBFX(wt, 0, 4); + int32_t w1 = __SBFX(wt, 4, 4); + int32_t w2 = __SBFX(wt, 8, 4); + int32_t w3 = __SBFX(wt, 12, 4); + + uint32_t evenW1 = __BFI(w2, w0, 16, 16); + uint32_t oddW1 = __BFI(w3, w1, 16, 16); + + uint32_t in; + std::memcpy((void*) &in, inputs, sizeof(in)); + + uint32_t evenA1 = __SXTB16(in); + uint32_t oddA1 = __SXTB16_RORn(in, 8); + + weightedSum = __SMLAD(evenA1, oddW1, weightedSum); + weightedSum = __SMLAD(oddA1, evenW1, weightedSum); + + return weightedSum; +} + +/** + * @brief Unsigned octo mac operation (4W/8A version) + * @details Performs eight mac operations for signed 4-bits weights + * and unsigned 8-bits inputs. Extracts the eight 4-bits weights + * from four stored 8-bits weights and associates them into + * four 32-bits values. Then zero extends eights 8-bits inputs and + * associates them into four 32-bits values. 
Finally performs a + * quadruple dual mac operation with the __SMLAD instruction + * + * @tparam Input_T Input type (should be udata<8>) + * @tparam Weight_T Weight type (should be data<4>) + * + * @param[in] inputs Pointer to input vector + * @param[in] weights Pointer to compressed kernel weights + * @param[in,out] weightedSum Accumulating sum from the + * previous mac operations + * @returns Updated weightedSum with + * the result of the octo mac operation + */ +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Weight_T>::digits == 4 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoMac(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + // uint32_t wt; + // std::memcpy((void*) &wt, weights, sizeof(wt)); + + // int32_t w0 = __SBFX(wt, 0, 4); + // int32_t w1 = __SBFX(wt, 4, 4); + // int32_t w2 = __SBFX(wt, 8, 4); + // int32_t w3 = __SBFX(wt, 12, 4); + // int32_t w4 = __SBFX(wt, 16, 4); + // int32_t w5 = __SBFX(wt, 20, 4); + // int32_t w6 = __SBFX(wt, 24, 4); + // int32_t w7 = __SBFX(wt, 28, 4); + + // // uint32_t evenW1 = __BFI(w2, w0, 16, 16); + // // uint32_t oddW1 = __BFI(w3, w1, 16, 16); + // // uint32_t evenW2 = __BFI(w6, w4, 16, 16); + // // uint32_t oddW2 = __BFI(w7, w5, 16, 16); + + // uint32_t evenW1 = __PKHBT(w0, w2, 16); + // uint32_t oddW1 = __PKHBT(w1, w3, 16); + // uint32_t evenW2 = __PKHBT(w4, w6, 16); + // uint32_t oddW2 = __PKHBT(w5, w7, 16); + + // uint32_t in1, in2; + // std::memcpy((void*) &in1, inputs, sizeof(in1)); + // std::memcpy((void*) &in2, (inputs + 4), sizeof(in2)); + + // uint32_t evenA1 = __UXTB16(in1); + // uint32_t oddA1 = __UXTB16_RORn(in1, 8); + // uint32_t evenA2 = __UXTB16(in2); + // uint32_t oddA2 = __UXTB16_RORn(in2, 8); + + // weightedSum = __SMLAD(evenA1, oddW1, weightedSum); + // weightedSum = __SMLAD(oddA1, 
evenW1, weightedSum); + // weightedSum = __SMLAD(evenA2, oddW2, weightedSum); + // weightedSum = __SMLAD(oddA2, evenW2, weightedSum); + + // 2nd implementation + // union n2d2_dataword wt; + // std::memcpy((void*) &wt, weights, sizeof(wt)); + + // union n2d2_udataword in1, in2; + // std::memcpy((void*) &in1, inputs, sizeof(in1)); + // std::memcpy((void*) &in2, inputs + 4, sizeof(in2)); + + // weightedSum += (data<32>)(in1.bytes[0]) * wt.half_bytes[0].fields.op1; + // weightedSum += (data<32>)(in1.bytes[1]) * wt.half_bytes[0].fields.op0; + // weightedSum += (data<32>)(in1.bytes[2]) * wt.half_bytes[1].fields.op1; + // weightedSum += (data<32>)(in1.bytes[3]) * wt.half_bytes[1].fields.op0; + // weightedSum += (data<32>)(in2.bytes[0]) * wt.half_bytes[2].fields.op1; + // weightedSum += (data<32>)(in2.bytes[1]) * wt.half_bytes[2].fields.op0; + // weightedSum += (data<32>)(in2.bytes[2]) * wt.half_bytes[3].fields.op1; + // weightedSum += (data<32>)(in2.bytes[3]) * wt.half_bytes[3].fields.op0; + + uint32_t wt; + memcpy((void*) &wt, weights, sizeof(wt)); + + // Works with weights * 4096 (weights << 12) + const uint32_t WeightMask = 0xF000F000; + uint32_t weight0 = WeightMask & (wt << 12); + uint32_t weight1 = WeightMask & (wt << 8); + uint32_t weight2 = WeightMask & (wt << 4); + uint32_t weight3 = WeightMask & (wt); + + uint32_t in1, in2; + std::memcpy((void*) &in1, inputs, sizeof(in1)); + std::memcpy((void*) &in2, (inputs + 4), sizeof(in2)); + + uint32_t in_a = __PKHBT(in1, in2, 16); + uint32_t in_b = __PKHTB(in2, in1, 16); + + uint32_t evenA1 = __UXTB16(in_a); + uint32_t oddA1 = __UXTB16_RORn(in_a, 8); + uint32_t evenA2 = __UXTB16(in_b); + uint32_t oddA2 = __UXTB16_RORn(in_b, 8); + + Sum_T sum = 0; + sum = __SMLAD(oddA1, weight0, sum); + sum = __SMLAD(evenA1, weight1, sum); + sum = __SMLAD(oddA2, weight2, sum); + sum = __SMLAD(evenA2, weight3, sum); + weightedSum += sum >> 12; + + return weightedSum; +} + +/** + * @brief Signed octo mac operation (4W/8A version) + * 
@details     Performs eight mac operations for signed 4-bits weights
+ *              and signed 8-bits inputs. Extracts the eight 4-bits weights
+ *              from four stored 8-bits weights and associates them into
+ *              four 32-bits values. Then sign extends eight 8-bits inputs and
+ *              associates them into four 32-bits values. Finally performs a
+ *              quadruple dual mac operation with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be data<8>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the octo mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Load the eight packed 4-bit weights and sign-extend each nibble.
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    int32_t w0 = __SBFX(wt, 0, 4);
+    int32_t w1 = __SBFX(wt, 4, 4);
+    int32_t w2 = __SBFX(wt, 8, 4);
+    int32_t w3 = __SBFX(wt, 12, 4);
+    int32_t w4 = __SBFX(wt, 16, 4);
+    int32_t w5 = __SBFX(wt, 20, 4);
+    int32_t w6 = __SBFX(wt, 24, 4);
+    int32_t w7 = __SBFX(wt, 28, 4);
+
+    // Pair the even-indexed and the odd-indexed weights into 16-bit lanes.
+    uint32_t evenW1 = __BFI(w2, w0, 16, 16);
+    uint32_t oddW1  = __BFI(w3, w1, 16, 16);
+    uint32_t evenW2 = __BFI(w6, w4, 16, 16);
+    uint32_t oddW2  = __BFI(w7, w5, 16, 16);
+
+    // Load the eight 8-bit inputs and sign-extend even / odd bytes.
+    uint32_t in1, in2;
+    std::memcpy((void*) &in1, inputs, sizeof(in1));
+    std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
+
+    uint32_t evenA1 = __SXTB16(in1);
+    uint32_t oddA1  = __SXTB16_RORn(in1, 8);
+    uint32_t evenA2 = __SXTB16(in2);
+    uint32_t oddA2  = __SXTB16_RORn(in2, 8);
+
+    // Even input bytes are multiplied by odd weight nibbles and vice versa.
+    weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
+    weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
+    weightedSum = __SMLAD(evenA2, oddW2, weightedSum);
+    weightedSum = __SMLAD(oddA2, evenW2, weightedSum);
+
+    return weightedSum;
+}
+
+
+/**
+ * @brief       Element-wise mac on packed weights (4W/8A version)
+ * @details     For each i in [0, nb_data), adds to weightedSums[i] the
+ *              product of inputs[i] and the signed 4-bits weight stored
+ *              at bit offset 4*i of the packed weights.
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSums    One accumulator per processed element
+ * @param[in]      nb_data         Number of elements to process. The
+ *                                 weights are staged in one 32-bits word,
+ *                                 so at most 8 elements can be handled
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(
+         std::numeric_limits<Weight_T>::digits == 4 &&
+         std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs,
+                    const Weight_T* __restrict weights,
+                    Sum_T* weightedSums,
+                    const int nb_data)
+{
+    // Two weights per byte: round the byte count up with integer
+    // arithmetic instead of a double-precision ceil().
+    const int nb_bytes = (nb_data > 0) ? (nb_data + 1) / 2 : 0;
+
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, nb_bytes);
+
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * inputs[i];
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+// -------------- MAC computing functions for kernel 4W-4A --------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief       Unsigned mono mac operation (4W/4A version)
+ * @details     Performs one mac operation for signed 4-bits weights
+ *              and unsigned 4-bits inputs. 
+ *              Only the nibble at bit offset 4 of the first input element
+ *              and of the first weight element is read.
+ *
+ * @tparam Input_T  Input type (should be udata<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the mono mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Zero-extended activation nibble times sign-extended weight nibble.
+    const auto activation = __UBFX(inputs[0], 4, 4);
+    const auto weight = __SBFX(weights[0], 4, 4);
+
+    weightedSum += activation * weight;
+    return weightedSum;
+}
+
+/**
+ * @brief       Signed mono mac operation (4W/4A version)
+ * @details     Performs one mac operation for signed 4-bits weights
+ *              and signed 4-bits inputs. 
+ *              Only the nibble at bit offset 4 of the first input element
+ *              and of the first weight element is read.
+ *
+ * @tparam Input_T  Input type (should be data<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the mono mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Both nibbles are sign-extended before being multiplied.
+    const auto activation = __SBFX(inputs[0], 4, 4);
+    const auto weight = __SBFX(weights[0], 4, 4);
+
+    weightedSum += activation * weight;
+    return weightedSum;
+}
+
+/**
+ * @brief       Unsigned dual mac operation (4W/4A version)
+ * @details     Performs two mac operations for signed 4-bits weights
+ *              and unsigned 4-bits inputs. Extracts the two 4-bits weights
+ *              from a stored 8-bits weight and associates them into
+ *              a 32-bits value. Then extracts the two 4-bits inputs
+ *              from a stored 8-bits input and associates them into
+ *              a 32-bits value. 
Finally performs a dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be udata<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the dual mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // One byte of each operand holds the two 4-bits values to process.
+    uint8_t packedWeights;
+    std::memcpy((void*) &packedWeights, weights, sizeof(packedWeights));
+
+    uint8_t packedInputs;
+    std::memcpy((void*) &packedInputs, inputs, sizeof(packedInputs));
+
+    // Sign-extend both weight nibbles and pair them into 16-bit lanes.
+    int32_t weightLo = __SBFX(packedWeights, 0, 4);
+    int32_t weightHi = __SBFX(packedWeights, 4, 4);
+    uint32_t weightPair = __BFI(weightHi, weightLo, 16, 16);
+
+    // Zero-extend both input nibbles and pair them the same way.
+    int32_t inputLo = __UBFX(packedInputs, 0, 4);
+    int32_t inputHi = __UBFX(packedInputs, 4, 4);
+    uint32_t inputPair = __BFI(inputHi, inputLo, 16, 16);
+
+    weightedSum = __SMLAD(inputPair, weightPair, weightedSum);
+    return weightedSum;
+}
+
+/**
+ * @brief       Signed dual mac operation (4W/4A version)
+ * @details     Performs two mac operations for signed 4-bits weights
+ *              and signed 4-bits inputs. Extracts the two 4-bits weights
+ *              from a stored 8-bits weight and associates them into
+ *              a 32-bits value. Then extracts the two 4-bits inputs
+ *              from a stored 8-bits input and associates them into
+ *              a 32-bits value. 
Finally performs a dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be data<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the dual mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // One byte of each operand holds the two 4-bits values to process.
+    uint8_t packedWeights;
+    std::memcpy((void*) &packedWeights, weights, sizeof(packedWeights));
+
+    uint8_t packedInputs;
+    std::memcpy((void*) &packedInputs, inputs, sizeof(packedInputs));
+
+    // Sign-extend both weight nibbles and pair them into 16-bit lanes.
+    int32_t weightLo = __SBFX(packedWeights, 0, 4);
+    int32_t weightHi = __SBFX(packedWeights, 4, 4);
+    uint32_t weightPair = __BFI(weightHi, weightLo, 16, 16);
+
+    // Sign-extend both input nibbles and pair them the same way.
+    int32_t inputLo = __SBFX(packedInputs, 0, 4);
+    int32_t inputHi = __SBFX(packedInputs, 4, 4);
+    uint32_t inputPair = __BFI(inputHi, inputLo, 16, 16);
+
+    weightedSum = __SMLAD(inputPair, weightPair, weightedSum);
+    return weightedSum;
+}
+
+/**
+ * @brief       Unsigned quad mac operation (4W/4A version)
+ * @details     Performs four mac operations for signed 4-bits weights
+ *              and unsigned 4-bits inputs. Extracts the four 4-bits weights
+ *              from two stored 8-bits weights and associates them into
+ *              two 32-bits values. Then extracts the four 4-bits inputs
+ *              from two stored 8-bits inputs and associates them into
+ *              two 32-bits values. 
Finally performs a double dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be udata<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the quad mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Two bytes of each operand hold the four 4-bits values to process.
+    uint16_t packedWeights;
+    std::memcpy((void*) &packedWeights, weights, sizeof(packedWeights));
+
+    uint16_t packedInputs;
+    std::memcpy((void*) &packedInputs, inputs, sizeof(packedInputs));
+
+    // Sign-extend the four weight nibbles.
+    int32_t wNib0 = __SBFX(packedWeights, 0, 4);
+    int32_t wNib1 = __SBFX(packedWeights, 4, 4);
+    int32_t wNib2 = __SBFX(packedWeights, 8, 4);
+    int32_t wNib3 = __SBFX(packedWeights, 12, 4);
+
+    // Zero-extend the four input nibbles.
+    int32_t aNib0 = __UBFX(packedInputs, 0, 4);
+    int32_t aNib1 = __UBFX(packedInputs, 4, 4);
+    int32_t aNib2 = __UBFX(packedInputs, 8, 4);
+    int32_t aNib3 = __UBFX(packedInputs, 12, 4);
+
+    // Pair nibbles (0, 2) and (1, 3) into 16-bit lanes, identically for
+    // weights and inputs, so that each lane multiplies matching indices.
+    uint32_t weightsEven = __BFI(wNib2, wNib0, 16, 16);
+    uint32_t weightsOdd  = __BFI(wNib3, wNib1, 16, 16);
+    uint32_t inputsEven  = __BFI(aNib2, aNib0, 16, 16);
+    uint32_t inputsOdd   = __BFI(aNib3, aNib1, 16, 16);
+
+    weightedSum = __SMLAD(inputsEven, weightsEven, weightedSum);
+    weightedSum = __SMLAD(inputsOdd, weightsOdd, weightedSum);
+
+    return weightedSum;
+}
+
+/**
+ * @brief       Signed quad mac operation (4W/4A version)
+ * @details     Performs four mac operations for signed 4-bits weights
+ *              and signed 4-bits inputs. Extracts the four 4-bits weights
+ *              from two stored 8-bits weights and associates them into
+ *              two 32-bits values. Then extracts the four 4-bits inputs
+ *              from two stored 8-bits inputs and associates them into
+ *              two 32-bits values. 
Finally performs a double dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be data<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the quad mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Two bytes of each operand hold the four 4-bits values to process.
+    uint16_t packedWeights;
+    std::memcpy((void*) &packedWeights, weights, sizeof(packedWeights));
+
+    uint16_t packedInputs;
+    std::memcpy((void*) &packedInputs, inputs, sizeof(packedInputs));
+
+    // Sign-extend the four weight nibbles.
+    int32_t wNib0 = __SBFX(packedWeights, 0, 4);
+    int32_t wNib1 = __SBFX(packedWeights, 4, 4);
+    int32_t wNib2 = __SBFX(packedWeights, 8, 4);
+    int32_t wNib3 = __SBFX(packedWeights, 12, 4);
+
+    // Sign-extend the four input nibbles.
+    int32_t aNib0 = __SBFX(packedInputs, 0, 4);
+    int32_t aNib1 = __SBFX(packedInputs, 4, 4);
+    int32_t aNib2 = __SBFX(packedInputs, 8, 4);
+    int32_t aNib3 = __SBFX(packedInputs, 12, 4);
+
+    // Pair nibbles (2, 0) and (3, 1) into 16-bit lanes, identically for
+    // weights and inputs, so that each lane multiplies matching indices.
+    uint32_t weightsEven = __PKHBT(wNib2, wNib0, 16);
+    uint32_t weightsOdd  = __PKHBT(wNib3, wNib1, 16);
+    uint32_t inputsEven  = __PKHBT(aNib2, aNib0, 16);
+    uint32_t inputsOdd   = __PKHBT(aNib3, aNib1, 16);
+
+    weightedSum = __SMLAD(inputsEven, weightsEven, weightedSum);
+    weightedSum = __SMLAD(inputsOdd, weightsOdd, weightedSum);
+
+    return weightedSum;
+}
+
+/**
+ * @brief       Unsigned octo mac operation (4W/4A version)
+ * @details     Performs eight mac operations for signed 4-bits weights
+ *              and unsigned 4-bits inputs. Extracts the eight 4-bits weights
+ *              from four stored 8-bits weights and associates them into
+ *              four 32-bits values. Then extracts the eight 4-bits inputs
+ *              from four stored 8-bits inputs and associates them into
+ *              four 32-bits values. 
Finally performs a quadruple dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @note         Two earlier implementations are kept below as commented-out
+ *               code; the active one follows them. It keeps each weight in
+ *               the top nibble of a 16-bit lane (weight * 4096) and shifts
+ *               the local sum right by 12 bits before adding it to
+ *               weightedSum.
+ *
+ * @tparam Input_T  Input type (should be udata<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the octo mac operation
+ */
+// template<typename Input_T, typename Weight_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     uint32_t wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     int32_t w0 = __SBFX(wt, 0, 4);
+//     int32_t w1 = __SBFX(wt, 4, 4);
+//     int32_t w2 = __SBFX(wt, 8, 4);
+//     int32_t w3 = __SBFX(wt, 12, 4);
+//     int32_t w4 = __SBFX(wt, 16, 4);
+//     int32_t w5 = __SBFX(wt, 20, 4);
+//     int32_t w6 = __SBFX(wt, 24, 4);
+//     int32_t w7 = __SBFX(wt, 28, 4);
+
+//     // uint32_t weight0 = __BFI(w4, w0, 16, 16);
+//     // uint32_t weight1 = __BFI(w5, w1, 16, 16);
+//     // uint32_t weight2 = __BFI(w6, w2, 16, 16);
+//     // uint32_t weight3 = __BFI(w7, w3, 16, 16);
+
+//     uint32_t weight0 = __PKHBT(w0, w4, 16);
+//     uint32_t weight1 = __PKHBT(w1, w5, 16);
+//     uint32_t weight2 = __PKHBT(w2, w6, 16);
+//     uint32_t weight3 = __PKHBT(w3, w7, 16);
+
+//     uint32_t in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     uint32_t act0 = in & 0xF000F;
+//     uint32_t act1 = (in >> 4) & 0xF000F;
+//     uint32_t act2 = (in >> 8) & 0xF000F;
+//     uint32_t act3 = (in >> 12) & 0xF000F;
+
+//     weightedSum = __SMLAD(act0, weight0, weightedSum);
+//     weightedSum = __SMLAD(act1, weight1, weightedSum);
+//     weightedSum = __SMLAD(act2, weight2, weightedSum);
+//     weightedSum = __SMLAD(act3, weight3, weightedSum);
+
+//     return weightedSum;
+// }
+
+// template<typename Input_T, typename Weight_T,
+//          typename std::enable_if<(std::is_unsigned<Input_T>::value
+//          && std::numeric_limits<Weight_T>::digits == 4
+//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+// __attribute__((always_inline)) static inline
+// Sum_T octoMac(const Input_T* __restrict inputs,
+//               const Weight_T* __restrict weights,
+//               Sum_T weightedSum)
+// {
+//     union n2d2_dataword wt;
+//     std::memcpy((void*) &wt, weights, sizeof(wt));
+
+//     union n2d2_udataword in;
+//     std::memcpy((void*) &in, inputs, sizeof(in));
+
+//     for (int i = 0; i < 4; ++i) {
+//         weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0;
+//         weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1;
+//     }
+
+//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1;
+//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0;
+//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1;
+
+//     return weightedSum;
+// }
+
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Load the eight packed 4-bit weights.
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    // Works with weights * 4096 (weights << 12): every shift/mask pair
+    // leaves one weight nibble in bits 12..15 of each 16-bit lane.
+    const uint32_t WeightMask = 0xF000F000;
+    uint32_t weight0 = WeightMask & (wt << 12);
+    uint32_t weight1 = WeightMask & (wt << 8);
+    uint32_t weight2 = WeightMask & (wt << 4);
+    uint32_t weight3 = WeightMask & (wt);
+
+    // Load the eight packed 4-bit inputs.
+    uint32_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));
+
+    // Keep one input nibble in bits 0..3 of each 16-bit lane. The mask is
+    // written first for the shifted operands (the original author expected
+    // the shift to be applied to the second operand of the AND).
+    const uint32_t ActMask = 0x000F000F;
+    uint32_t act0 = in & ActMask;
+    uint32_t act1 = ActMask & (in >> 4);
+    uint32_t act2 = ActMask & (in >> 8);
+    uint32_t act3 = ActMask & (in >> 12);
+
+    // Accumulate the scaled products in a local sum, then undo the
+    // "* 4096" scaling before updating the caller's accumulator.
+    Sum_T sum = 0;
+    sum = __SMLAD(act0, weight0, sum);
+    sum = __SMLAD(act1, weight1, sum);
+    sum = __SMLAD(act2, weight2, sum);
+    sum = __SMLAD(act3, weight3, sum);
+
+    return weightedSum + (sum >> 12);
+}
+
+/**
+ * @brief       Signed octo mac operation (4W/4A version)
+ * @details     Performs eight mac operations for signed 4-bits weights
+ *              and signed 4-bits inputs. Extracts the eight 4-bits weights
+ *              from four stored 8-bits weights and associates them into
+ *              four 32-bits values. Then extracts the eight 4-bits inputs
+ *              from four stored 8-bits inputs and associates them into
+ *              four 32-bits values. 
Finally performs a quadruple dual mac operation
+ *              with the __SMLAD instruction
+ *
+ * @tparam Input_T  Input type (should be data<4>)
+ * @tparam Weight_T Weight type (should be data<4>)
+ * @tparam Sum_T    Type of the accumulator
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum with
+ *                                 the result of the octo mac operation
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
+{
+    // Load the eight packed 4-bit weights and sign-extend each nibble.
+    uint32_t wt;
+    std::memcpy((void*) &wt, weights, sizeof(wt));
+
+    int32_t w0 = __SBFX(wt, 0, 4);
+    int32_t w1 = __SBFX(wt, 4, 4);
+    int32_t w2 = __SBFX(wt, 8, 4);
+    int32_t w3 = __SBFX(wt, 12, 4);
+    int32_t w4 = __SBFX(wt, 16, 4);
+    int32_t w5 = __SBFX(wt, 20, 4);
+    int32_t w6 = __SBFX(wt, 24, 4);
+    int32_t w7 = __SBFX(wt, 28, 4);
+
+    uint32_t evenW1 = __PKHBT(w2, w0, 16);
+    uint32_t oddW1  = __PKHBT(w3, w1, 16);
+    uint32_t evenW2 = __PKHBT(w6, w4, 16);
+    uint32_t oddW2  = __PKHBT(w7, w5, 16);
+
+    // Same treatment for the eight packed 4-bit inputs, so that every
+    // 16-bit lane multiplies an input by the weight of the same index.
+    uint32_t in;
+    std::memcpy((void*) &in, inputs, sizeof(in));
+
+    int32_t a0 = __SBFX(in, 0, 4);
+    int32_t a1 = __SBFX(in, 4, 4);
+    int32_t a2 = __SBFX(in, 8, 4);
+    int32_t a3 = __SBFX(in, 12, 4);
+    int32_t a4 = __SBFX(in, 16, 4);
+    int32_t a5 = __SBFX(in, 20, 4);
+    int32_t a6 = __SBFX(in, 24, 4);
+    int32_t a7 = __SBFX(in, 28, 4);
+
+    uint32_t evenA1 = __PKHBT(a2, a0, 16);
+    uint32_t oddA1  = __PKHBT(a3, a1, 16);
+    uint32_t evenA2 = __PKHBT(a6, a4, 16);
+    uint32_t oddA2  = __PKHBT(a7, a5, 16);
+
+    weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
+    weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
+    weightedSum = __SMLAD(evenA2, evenW2, weightedSum);
+    weightedSum = __SMLAD(oddA2, oddW2, weightedSum);
+
+    return weightedSum;
+}
+
+
+/**
+ * @brief       Unsigned element-wise mac on packed operands (4W/4A version)
+ * @details     For each i in [0, nb_data), adds to weightedSums[i] the
+ *              product of the signed 4-bits weight and of the unsigned
+ *              4-bits input stored at bit offset 4*i of their packed words.
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSums    One accumulator per processed element
+ * @param[in]      nb_data         Number of elements to process. Both
+ *                                 operands are staged in one 32-bits word,
+ *                                 so at most 8 elements can be handled
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs,
+                    const Weight_T* __restrict weights,
+                    Sum_T* weightedSums,
+                    const int nb_data)
+{
+    // Two values per byte: round the byte count up with integer
+    // arithmetic instead of a double-precision ceil().
+    const int nb_bytes = (nb_data > 0) ? (nb_data + 1) / 2 : 0;
+
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, nb_bytes);
+
+    uint32_t in = 0;
+    std::memcpy((void*) &in, inputs, nb_bytes);
+
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4);
+    }
+}
+
+/**
+ * @brief       Signed element-wise mac on packed operands (4W/4A version)
+ * @details     For each i in [0, nb_data), adds to weightedSums[i] the
+ *              product of the signed 4-bits weight and of the signed
+ *              4-bits input stored at bit offset 4*i of their packed words.
+ *
+ * @param[in]      inputs          Pointer to compressed input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSums    One accumulator per processed element
+ * @param[in]      nb_data         Number of elements to process. Both
+ *                                 operands are staged in one 32-bits word,
+ *                                 so at most 8 elements can be handled
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Weight_T>::digits == 4
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+void macsOnParallel(const Input_T* __restrict inputs,
+                    const Weight_T* __restrict weights,
+                    Sum_T* weightedSums,
+                    const int nb_data)
+{
+    // Two values per byte: round the byte count up with integer
+    // arithmetic instead of a double-precision ceil().
+    const int nb_bytes = (nb_data > 0) ? (nb_data + 1) / 2 : 0;
+
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, nb_bytes);
+
+    uint32_t in = 0;
+    std::memcpy((void*) &in, inputs, nb_bytes);
+
+    for (int i = 0; i < nb_data; ++i) {
+        weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4);
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+// ------------------ Notes about performing MAC operations -------------------
+// --------------------------- with 1-bit weights -----------------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @note    How to perform MAC operations with 1-bit weight
+ *
+ * Working with an 1-bit weight means working only with two possible values
+ * for each weight. 
Thus, a convention has been defined and is used
+ * by the following functions in this file.
+ *      Convention:  when the value of a weight is 0, it means 1
+ *                   when the value of a weight is 1, it means -1
+ *
+ * Example: let's take a simple dual MAC operation
+ *      weightedSum = w0 * a0 + w1 * a1;
+ *
+ *      if w0 = 0x00 and w1 = 0x01 then weightedSum should be:
+ *      weightedSum = a0 - a1;
+ *
+ * To perform MAC operations easily, and to use SIMD instructions as often
+ * as possible to parallelize and speed up the MAC calculations, most of
+ * the following functions use the same scheme:
+ *
+ *  - Perform a parallel subtraction of 0 and the weights
+ *      Some SIMD instructions such as __USUB16 and __USUB8 perform
+ *      parallel subtractions and set a Greater or Equal (GE) flag for
+ *      each lane whose subtraction result is greater than or equal to zero.
+ *      Thus, if the result of 0 - w0 >= 0  ==> GE[0] = 1
+ *                             0 - w0 <  0  ==> GE[0] = 0
+ *      (the results of the subtractions are not saved because only the
+ *      update of the GE flags is required)
+ *
+ *  - Use of the __SEL instruction to read the GE flags
+ *      __SEL selects, for each lane, one of two input values according to
+ *      the GE flag provided by the previous subtraction. In the case of
+ *      the 1W/8A project, the two possible values selected by __SEL are
+ *      (+input) or (-input). 
Thus, __SEL is often used like "__SEL(in, -in)"
+ *      The results of __SEL are saved as MAC results
+ *
+ *  - Addition of the accumulating sums with the results of the MAC operations
+ *      Use of __SADD16 or __SADD8 for signed additions
+ *
+ */
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 8A ------------------------------------
+// ------------------------------- 1W / 7A ------------------------------------
+// ------------------------------- 1W / 6A ------------------------------------
+// ------------------------------- 1W / 5A ------------------------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief       Mono mac operation (1W / 5A to 8A version)
+ * @details     Adds inputs[0] to the accumulator when the weight field
+ *              op7 of weights[0] is 0, subtracts it when the field is set.
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T monoMac (const Input_T* __restrict inputs,
+               const Weight_T* __restrict weights,
+               Sum_T weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    return weightedSum;
+}
+
+/**
+ * @brief       Unrolled 1-bit-weight macs over NB_ITERATIONS inputs
+ * @details     The overloads below handle NB_ITERATIONS = 2 to 7. Input k
+ *              is added to weightedSum when the weight field op(7-k) of
+ *              weights[0] is 0 and subtracted when that field is set.
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulator updated in place
+ */
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 2)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 3)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 5)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 6)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
+}
+
+template<int NB_ITERATIONS,
+         typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits <= 8
+         && std::numeric_limits<Input_T>::digits > 4
+         && NB_ITERATIONS == 7)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void macsOnRange (const Input_T* __restrict inputs,
+                  const Weight_T* __restrict weights,
+                  Sum_T& weightedSum)
+{
+    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
+    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
+    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
+    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
+    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
+    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
+    weightedSum += (weights[0].fields.op1) ? (Sum_T)(-(inputs[6])) : (Sum_T)(inputs[6]);
+}
+
+
+// ----------------------------------------------------------------------------
+// ----------------- MAC computing functions for kernel -----------------------
+// ------------------------------- 1W / 8A ------------------------------------
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief       Octo mac operation (1W/8A version)
+ * @details     Reads one byte of weights (eight 1-bit weights) and eight
+ *              8-bits inputs. For every pair of 16-bit lanes, __USUB16 on
+ *              the masked weight bits sets the GE flags, __SEL picks the
+ *              input or its negation accordingly, and __SADD16 adds the
+ *              selection to the accumulator.
+ *
+ * @note        NOTE(review): the accumulation is done with __SADD16, i.e.
+ *              as two independent 16-bit lanes inside weightedSum;
+ *              presumably the caller combines both lanes afterwards -
+ *              confirm against the call sites.
+ *
+ * @param[in]      inputs          Pointer to input vector
+ * @param[in]      weights         Pointer to compressed kernel weights
+ * @param[in,out]  weightedSum     Accumulating sum from the
+ *                                 previous mac operations
+ * @returns                        Updated weightedSum
+ */
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+Sum_T octoMac (const Input_T* __restrict inputs,
+               const Weight_T* __restrict weights,
+               Sum_T weightedSum)
+{
+    uint32_t mac_result = 0;
+    uint32_t in;
+
+    // Duplicate the weight byte into the upper halfword so that each
+    // 16-bit lane can test its own weight bit.
+    uint32_t wt = 0;
+    std::memcpy((void*) &wt, weights, 1);
+    wt |= wt << 16;
+
+    // Inputs 0..3: zero-extend even / odd bytes and pre-compute negations.
+    std::memcpy((void*) &in, inputs, sizeof(in));
+    uint32_t evenA1 = __UXTB16(in);
+    uint32_t oddA1 = __UXTB16_RORn(in, 8);
+    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
+    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
+
+    // Weight bits 0 (low lane) and 2 (high lane) drive the even bytes.
+    __USUB16(0, wt & 0x40001);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    // Weight bits 1 (low lane) and 3 (high lane) drive the odd bytes.
+    __USUB16(0, wt & 0x80002);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+
+    // Inputs 4..7: same scheme with weight bits 4..7.
+    std::memcpy((void*) &in, inputs + 4, sizeof(in));
+    evenA1 = __UXTB16(in);
+    oddA1 = __UXTB16_RORn(in, 8);
+    neg_evenA1 = __SSUB16(0, evenA1);
+    neg_oddA1 = __SSUB16(0, oddA1);
+
+    __USUB16(0, wt & 0x400010);
+    mac_result = __SEL(evenA1, neg_evenA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    __USUB16(0, wt & 0x800020);
+    mac_result = __SEL(oddA1, neg_oddA1);
+    weightedSum = __SADD16(mac_result, weightedSum);
+
+    return weightedSum;
+}
+
+template<typename Input_T, typename Weight_T, typename Sum_T,
+         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) 
static inline +Sum_T quadquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t mac_result = 0; + uint32_t in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 2); + wt |= wt << 16; + + memcpy((void*) &in, inputs, sizeof(in)); + uint32_t evenA1 = __UXTB16(in); + uint32_t oddA1 = __UXTB16_RORn(in, 8); + uint32_t neg_evenA1 = __SSUB16(0, evenA1); + uint32_t neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x40001); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x80002); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 4, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x400010); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x800020); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 8, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x4000100); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x8000200); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 12, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x40001000); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x80002000); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + return 
weightedSum; +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t mac_result = 0; + uint32_t in; + uint32_t wt; + memcpy((void*) &wt, weights, 4); + uint32_t wt1 = __PKHBT(wt, wt, 16); + uint32_t wt2 = __PKHTB(wt, wt, 16); + + memcpy((void*) &in, inputs, sizeof(in)); + uint32_t evenA1 = __UXTB16(in); + uint32_t oddA1 = __UXTB16_RORn(in, 8); + uint32_t neg_evenA1 = __SSUB16(0, evenA1); + uint32_t neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x40001); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x80002); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 4, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x400010); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x800020); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 8, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x4000100); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x8000200); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 12, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, 
evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt & 0x40001000); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt & 0x80002000); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 16, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt2 & 0x40001); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt2 & 0x80002); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 20, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt2 & 0x400010); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt2 & 0x800020); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 24, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt2 & 0x4000100); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt2 & 0x8000200); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = __SADD16(mac_result, weightedSum); + + + memcpy((void*) &in, inputs + 28, sizeof(in)); + evenA1 = __UXTB16(in); + oddA1 = __UXTB16_RORn(in, 8); + neg_evenA1 = __SSUB16(0, evenA1); + neg_oddA1 = __SSUB16(0, oddA1); + + __USUB16(0, wt2 & 0x40001000); + mac_result = __SEL(evenA1, neg_evenA1); + weightedSum = __SADD16(mac_result, weightedSum); + + __USUB16(0, wt2 & 0x80002000); + mac_result = __SEL(oddA1, neg_oddA1); + weightedSum = 
__SADD16(mac_result, weightedSum); + + return weightedSum; +} + +// ---------------------------------------------------------------------------- +// ----------------- MAC computing functions for kernel ----------------------- +// ------------------------------- 1W / 7A ------------------------------------ +// ---------------------------------------------------------------------------- + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 1); + wt |= wt << 8; + wt |= wt << 16; + + memcpy((void*) &in, inputs, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0; + + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08040201); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0; + + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x80402010); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + return weightedSum; +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) { - 
return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val); + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 4); + + memcpy((void*) &in, inputs, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x01010101); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 8, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 12, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + return weightedSum; } -template<int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - typename Input_T, - typename Weight_T, - typename Sum_T> -inline static -Sum_T dualMac(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T weightedSum) +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) { - weightedSum += inputs[0] * weights[0] - + inputs[INPUTS_INC] * weights[WEIGHTS_INC]; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + 
std::memcpy((void*) &wt, weights, 4); + + memcpy((void*) &in, inputs, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x01010101); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 8, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 12, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 16, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x10101010); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 20, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x20202020); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 24, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x40404040); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + memcpy((void*) &in, inputs + 28, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x80808080); + mac_result = __SEL(in, neg_in); + weightedSum = __SXTAB16(weightedSum, 
mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); return weightedSum; } -template<int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - typename Input_T, - typename Weight_T, - typename Sum_T, - typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr> -inline static -Sum_T quadMac(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T weightedSum) +// ---------------------------------------------------------------------------- +// ----------------- MAC computing functions for kernel ----------------------- +// ------------------------------- 1W / 5A ------------------------------------ +// ---------------------------------------------------------------------------- + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) { - weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC] - + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC] - + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC] - + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC]; + uint32_t sum = 0; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 1); + wt |= wt << 8; + wt |= wt << 16; + + memcpy((void*) &in, inputs, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0x70707070) ^ 0x70707070; + + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08040201); + sum = __SEL(in, neg_in); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0x70707070) ^ 0x70707070; + + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x80402010); + mac_result = __SEL(in, neg_in); + + sum = __QADD8(sum, mac_result); 
return weightedSum; } -template<int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - typename Input_T, - typename Weight_T, - typename Sum_T, - typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr> -inline static -Sum_T quadMac(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T weightedSum) +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) { - if(INPUTS_INC != 1 || WEIGHTS_INC != 1) { - weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC] - + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC] - + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC] - + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC]; + uint32_t sum = 0; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 2); - return weightedSum; - } + memcpy((void*) &in, inputs, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x01010101); + sum = __SEL(in, neg_in); - // Inputs loading & preparation + memcpy((void*) &in, inputs + 4, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 8, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 12, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + weightedSum = __SXTAB16(weightedSum, sum); + weightedSum = __SXTAB16_RORn(weightedSum, sum, 8); + + return weightedSum; +} + +template<typename Input_T, typename Weight_T, typename 
Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t sum = 0; + uint32_t mac_result = 0; uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 4); + memcpy((void*) &in, inputs, sizeof(in)); - - uint32_t in1 = XTB16<Input_T>(in); - uint32_t in2 = XTB16<Input_T>(in >> 8); - - // Weights loading & preparation - uint32_t wt; - memcpy((void*) &wt, weights, sizeof(wt)); - - uint32_t wt1 = XTB16<Weight_T>(wt); - uint32_t wt2 = XTB16<Weight_T>(wt >> 8); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x01010101); + sum = __SEL(in, neg_in); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 8, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 12, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 16, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x10101010); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 20, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x20202020); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 24, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x40404040); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 28, sizeof(in)); + neg_in = __SSUB8(0, in); + __USUB8(0, wt & 0x80808080); + 
mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + weightedSum = __SXTAB16(weightedSum, sum); + weightedSum = __SXTAB16_RORn(weightedSum, sum, 8); - // Computation - if(std::is_same<Sum_T, int32_t>::value) { - weightedSum = __SMLAD(in1, wt1, weightedSum); - weightedSum = __SMLAD(in2, wt2, weightedSum); - } - else { - weightedSum = __SMLALD(in1, wt1, weightedSum); - weightedSum = __SMLALD(in2, wt2, weightedSum); - - } - return weightedSum; } +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits <= 8 + && std::numeric_limits<Input_T>::digits > 4 + && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = octoMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-8>(inputs + 8, weights + 1, weightedSum); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits <= 8 + && std::numeric_limits<Input_T>::digits > 4 + && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = quadquadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-16>(inputs + 16, weights + 2, weightedSum); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits <= 8 + && std::numeric_limits<Input_T>::digits > 4 + && NB_ITERATIONS >= 32)>::type* = nullptr> 
+__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = octoquadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-32>(inputs + 32, weights + 4, weightedSum); +} + + +// ---------------------------------------------------------------------------- +// ----------------- MAC computing functions for kernel ----------------------- +// ------------------------------- 1W / 4A ------------------------------------ +// ---------------------------------------------------------------------------- -// ************************************************************************** -// * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS * -// ************************************************************************** +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T monoMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + weightedSum += (weights[0].fields.op7) ? 
(Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + return weightedSum; +} template<int NB_ITERATIONS, - int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - class Input_T, - class Weight_T, - class Sum_T, - typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr> -inline static -void macsOnRange(const Input_T* __restrict /*inputs*/, - const Weight_T* __restrict /*weights*/, - Sum_T& __restrict /*weightedSum*/) + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 2)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) { - // Nothing to do + weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); } template<int NB_ITERATIONS, - int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - class Input_T, - class Weight_T, - class Sum_T, - typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr> -inline static -void macsOnRange(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T& __restrict weightedSum) + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 3)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) { - weightedSum += (*weights) * (*inputs); + weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? 
(Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); + weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1); } template<int NB_ITERATIONS, - int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - class Input_T, - class Weight_T, - class Sum_T, - typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr> -inline static -void macsOnRange(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T& __restrict weightedSum) + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) { - weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum); - macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC, - weights + 2*WEIGHTS_INC, - weightedSum); + weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); + weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1); + weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 5)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum += (weights[0].fields.op7) ? 
(Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); + weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1); + weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0); + weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 6)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); + weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1); + weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0); + weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1); + weightedSum += (weights[0].fields.op2) ? 
(Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1); + weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0); + weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1); + weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0); + weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1); + weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0); + weightedSum += (weights[0].fields.op1) ? 
(Sum_T)(-(inputs[3].fields.op1)) : (Sum_T)(inputs[3].fields.op1); +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t sum = 0; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 1); + wt |= wt << 8; + wt |= wt << 16; + + memcpy((void*) &in, inputs, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x40100401); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = __SSUB8(0, (in >> 4) & 0xF0F0F0F0); + __USUB8(0, wt & 0x80200802); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + return weightedSum; +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T quadquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t sum = 0; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 4); + + memcpy((void*) &in, inputs, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x01010101); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + 
neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + + weightedSum = __SXTAB16(weightedSum, sum); + weightedSum = __SXTAB16_RORn(weightedSum, sum, 8); + + return weightedSum; +} + +template<typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +Sum_T octoquadMac (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T weightedSum) +{ + uint32_t sum = 0; + uint32_t mac_result = 0; + uint32_t in; + uint32_t neg_in; + uint32_t wt = 0; + std::memcpy((void*) &wt, weights, 4); + + memcpy((void*) &in, inputs, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x01010101); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x02020202); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 4, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x04040404); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x08080808); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 8, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x10101010); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x20202020); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + memcpy((void*) &in, inputs + 12, sizeof(in)); + + neg_in = __SSUB8(0, in & 0x0F0F0F0F); + __USUB8(0, wt & 0x40404040); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + neg_in = 
__SSUB8(0, (in >> 4) & 0x0F0F0F0F); + __USUB8(0, wt & 0x80808080); + mac_result = __SEL(in, neg_in); + sum = __QADD8(sum, mac_result); + + weightedSum = __SXTAB16(weightedSum, sum); + weightedSum = __SXTAB16_RORn(weightedSum, sum, 8); + + return weightedSum; +} + + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = octoMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-8>(inputs + 4, weights + 1, weightedSum); +} + +template<int NB_ITERATIONS, + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = quadquadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-16>(inputs + 8, weights + 2, weightedSum); } -/** - * @brief MACs Processing - * @details Performs NB_ITERATIONS MACs operations, storing results into the - * weightedSum variable. 
- * - * @tparam NB_ITERATIONS Number of MACs to perform - * @tparam INPUTS_INC Input Stride - * @tparam WEIGHTS_INC Weights Stride - * @tparam Input_T Input Type - * - * @param inputs Pointer to inputs vector - * @param weights Pointer to weights vector - * @param weightedSum Pointer to weightedSum -*/ template<int NB_ITERATIONS, - int INPUTS_INC = 1, - int WEIGHTS_INC = 1, - class Input_T, - class Weight_T, - class Sum_T, - typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr> -inline static -void macsOnRange(const Input_T* __restrict inputs, - const Weight_T* __restrict weights, - Sum_T& __restrict weightedSum) + typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1 + && std::numeric_limits<Input_T>::digits == 4 + && NB_ITERATIONS >= 32)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange (const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = octoquadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS-32>(inputs + 16, weights + 4, weightedSum); +} + + +// ---------------------------------------------------------------------------- +// -------------- MAC computing functions for kernel 1W-7A -------------------- +// ---------------------------------------------------------------------------- + +template<typename Input_T, + typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +uint32_t quadMacInter(const Input_T* __restrict inputs, + const uint32_t weight, + uint32_t weightedSum) +{ + uint32_t in; + memcpy((void*) &in, inputs, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0; + + uint32_t neg_in = __SSUB8(0, in); + + __USUB8(0, weight); + uint32_t mac_result = __SEL(in, neg_in); + + uint32_t evenA1 = __SXTB16(mac_result); + uint32_t oddA1 = 
__SXTB16_RORn(mac_result, 8); + + weightedSum = __SADD16(evenA1, weightedSum); + weightedSum = __SADD16(oddA1, weightedSum); + + return weightedSum; +} + +template<typename Input_T, + typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr> +__attribute__((always_inline)) static inline +uint32_t quadMacInterV2(const Input_T* __restrict inputs, + const uint32_t weight, + uint32_t weightedSum) +{ + uint32_t in; + memcpy((void*) &in, inputs, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0; + + uint32_t neg_in = __SSUB8(0, in); + + __USUB8(0, weight); + uint32_t mac_result = __SEL(in, neg_in); + + weightedSum = __SXTAB16(weightedSum, mac_result); + weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); + + return weightedSum; +} + + +// ---------------------------------------------------------------------------- +// -------------- MAC computing functions for kernel 1W-5A -------------------- +// ---------------------------------------------------------------------------- + +template<typename Input_T, + typename std::enable_if<(std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr> +__attribute__((always_inline)) static inline +uint32_t quadMacInter(const Input_T* __restrict inputs, + const uint32_t weight, + uint32_t weightedSum) +{ + uint32_t in; + memcpy((void*) &in, inputs, sizeof(in)); + + // Sign extend + if (!std::is_unsigned<Input_T>::value) + in = (in + 0x70707070) ^ 0x70707070; + + uint32_t neg_in = __SSUB8(0, in); + + __USUB8(0, weight); + uint32_t mac_result = __SEL(in, neg_in); + + weightedSum = __QADD8(weightedSum, mac_result); + + return weightedSum; +} + + +// ---------------------------------------------------------------------------- +// ------------------- MAC computing general functions ------------------------ +// ---------------------------------------------------------------------------- + +template<int NB_ITERATIONS, typename Input_T, typename 
Weight_T, typename Sum_T, + typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict /*inputs*/, + const Weight_T* __restrict /*weights*/, + Sum_T& /*weightedSum*/) +{ + // Nothing should happen +} + +template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + weightedSum = monoMac(inputs, weights, weightedSum); +} + +template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + constexpr unsigned int idxI + = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1; + constexpr unsigned int idxW + = (std::numeric_limits<Weight_T>::digits > 4) ? 2 : 1; + + weightedSum = dualMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum); +} + +template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<NB_ITERATIONS >= 4 + && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + constexpr unsigned int idxI + = (std::numeric_limits<Input_T>::digits > 4) + ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 
2 : 1; + + constexpr unsigned int idxW = 4; + + weightedSum = quadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum); +} + +template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) + && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) { - weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum); - macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC, - weights + 4*WEIGHTS_INC, - weightedSum); + constexpr unsigned int idxI + = (std::numeric_limits<Input_T>::digits > 4) + ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1; + + constexpr unsigned int idxW = 2; + + weightedSum = quadMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum); } +template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T, + typename std::enable_if<NB_ITERATIONS >= 8 + && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void macsOnRange(const Input_T* __restrict inputs, + const Weight_T* __restrict weights, + Sum_T& weightedSum) +{ + constexpr unsigned int idxI + = (std::numeric_limits<Input_T>::digits > 4) + ? 8 : (std::numeric_limits<Input_T>::digits == 4) + ? 4 : (std::numeric_limits<Input_T>::digits == 2) + ? 
2 : 1; + + constexpr unsigned int idxW = 4; + + weightedSum = octoMac(inputs, weights, weightedSum); + macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum); +} -} // N2D2_Export -#endif // __N2D2_EXPORT_CPP_MACS_HPP__ +#endif // __N2D2_MAC_FUNCTIONS_HPP__ diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp index 68d6f21..94615a5 100644 --- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp @@ -27,15 +27,15 @@ namespace N2D2_Export { -static int64_t toInt64(uint32_t lo, uint32_t hi) { - return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo); -} - -static int64_t smlal(int32_t lhs, int32_t rhs, - uint32_t accumLo, uint32_t accumHi) -{ - return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi); -} +// static int64_t toInt64(uint32_t lo, uint32_t hi) { +// return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo); +// } + +// static int64_t smlal(int32_t lhs, int32_t rhs, +// uint32_t accumLo, uint32_t accumHi) +// { +// return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi); +// } // --------------------------------------------------- // ------------------- No Scaling -------------------- diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp new file mode 100644 index 0000000..62743db --- /dev/null +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp @@ -0,0 +1,312 @@ +/** + ****************************************************************************** + * @file subkernels_functions.hpp + * @brief Header file for the network subkernels + * + ****************************************************************************** + * @attention + * + * (C) Copyright 
2021 CEA LIST. All Rights Reserved. + * Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr) + * + * This file is not part of the open source version of N2D2 and is NOT under + * the CeCILL-C license. This code is the property of the CEA. It can not be + * copied or disseminated without its authorization. + * + ****************************************************************************** + */ + +#ifndef __SUBKERNELS_FUNCTIONS_H__ +#define __SUBKERNELS_FUNCTIONS_H__ + +#include <cstring> +#include <cmsis_compiler.h> +#include "typedefs.hpp" +#include "assert.h" + + +// ---------------------------------------------------------------------------- +// -------------------------- Compression functions --------------------------- +// ---------------------------------------------------------------------------- + +/** + * @brief Compact data during a loop with an accumulator + * @details This function is used in the network functions to compress + * and store a value in the outputs vector. The function adds + * the value to an accumulator. If the accumulator is full + * (ie all the available slots are taken), then the accumulator + * is stored in the outputs. Otherwise, the accumulator temporaly + * keeps the previous values and it is shifted by + * the number of bits required to store the quantized values. 
+ * + * @param[in] value Value to be stored in the accumulator + * @param[in,out] outputs Pointer to compressed output vector + * @param[in,out] outputOffset Pointer to the current output index + * @param[in,out] infoPack Object containing the accumulator + * @returns None + * + */ +template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0> +__attribute__((always_inline)) static inline +void compact_data_during_loop (Output_T value, + Output_T* __restrict outputs, + int& outputOffset, + PackSupport& infoPack) +{ + if (std::numeric_limits<Output_T>::digits < 8) { + constexpr uint8_t mask = (1U << std::numeric_limits<Output_T>::digits) - 1; + constexpr uint8_t nbSlot = ceil((double)8/std::numeric_limits<Output_T>::digits); + + infoPack.accumulator |= value.value & mask; + infoPack.cptAccumulator += 1; + + if (infoPack.cptAccumulator == nbSlot) { + outputs[outputOffset] = (Output_T) infoPack.accumulator; + ++outputOffset; + infoPack.cptAccumulator = 0; + infoPack.accumulator = 0; + } + else { + infoPack.accumulator <<= std::numeric_limits<Output_T>::digits; + } + } else { + outputs[outputOffset] = (Output_T) value; + ++outputOffset; + } +} + +template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0> +__attribute__((always_inline)) static inline +void compact_data_during_loop (const Output_T value, + Output_T* __restrict outputs, + int& outputOffset, + PackSupport& infoPack) +{ + outputs[outputOffset] = value; +} + +/** + * @brief Compact data after a loop with an accumulator + * @details It may happen that the accumulator is not completely filled + * after calling "compact_data_during_loop" and the stored + * quantized values in the accumulator have not been saved + * in the outputs. Thus, this function adds extra zeros to the + * accumulator until it is full. Then the accumulator is + * stored in the outputs. 
+ * This function should always be called at the end of a loop + * where "compact_data_during_loop" is called + * + * @param[in,out] outputs Pointer to compressed output vector + * @param[in,out] outputOffset Current output index + * @param[in,out] infoPack Object containing the accumulator + * @returns None + * + */ +template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0> +__attribute__((always_inline)) static inline +void compact_data_end_loop (Output_T* __restrict outputs, + int& outputOffset, + PackSupport& infoPack) +{ + if (std::numeric_limits<Output_T>::digits < 8) { + + // if data still accumulated but not stored + if (infoPack.cptAccumulator != 0) { + constexpr unsigned int nbSlot = ceil((double)8/std::numeric_limits<Output_T>::digits); + + // Add extra zero to shift data to the left + infoPack.cptAccumulator += 1; + while (infoPack.cptAccumulator < nbSlot) { + infoPack.accumulator <<= std::numeric_limits<Output_T>::digits; + infoPack.cptAccumulator += 1; + } + outputs[outputOffset] = infoPack.accumulator; + ++outputOffset; + infoPack.cptAccumulator = 0; + infoPack.accumulator = 0; + } + } +} + +template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0> +__attribute__((always_inline)) static inline +void compact_data_end_loop (Output_T* __restrict outputs, + int& outputOffset, + PackSupport& infoPack) +{ + // Nothing +} + + + +// ---------------------------------------------------------------------------- +// ------------------------- Pooling subfunctions ----------------------------- +// ------------------------------ Max Pooling --------------------------------- +// ---------------------------------------------------------------------------- + +__attribute__((always_inline)) static inline +int get_pool_nbData (const int nbBits) +{ + int nb_data = 1; + switch (nbBits) + { + case 8: nb_data = 4; + break; + case 4: nb_data = 2; + break; + case 16: nb_data = 
2; + break; + default: + break; + } + return nb_data; +} + +template<typename Output_T, + typename std::enable_if<std::numeric_limits<Output_T>::digits == 4>::type* = nullptr> +__attribute__((always_inline)) static inline +void storeMaxPooling (Output_T* __restrict outputs, + int& outputOffset, + const uint32_t maxVal, + const int nb_data) +{ + uint32_t data_val = maxVal; + assert(nb_data == 2 || nb_data == 1); + + // Gather bytes in pairs of bytes + // Ex: 0x0A050403 -> 0x00A50043 + data_val = ((data_val & 0x0F000F00) >> 4) | (data_val & 0x000F000F); + + // Output compression and storage + for (int index = 0; index < nb_data; ++index) { + outputs[outputOffset] = (uint8_t) ((data_val >> 16*index) & 0xFF); + outputOffset += 1; + } +} + +template<typename Output_T, + typename std::enable_if<std::numeric_limits<Output_T>::digits == 8>::type* = nullptr> +__attribute__((always_inline)) static inline +void storeMaxPooling (Output_T* __restrict outputs, + int& outputOffset, + const uint32_t maxVal, + const int nb_data) +{ + memcpy(outputs, &maxVal, nb_data*sizeof(uint8_t)); +} + +template<typename Input_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t& maxVal, + const int nb_data) +{ + assert(nb_data == 2 || nb_data == 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, nb_data*sizeof(uint16_t)); + + maxVal = __UQSUB16(maxVal, in); + maxVal = __UQADD16(maxVal, in); +} + +template<typename Input_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t maxVal, + const int nb_data) +{ + assert(nb_data == 2 || nb_data == 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, 
nb_data*sizeof(uint16_t)); + + maxVal = __SSUB16(maxVal, in); + maxVal = __SEL(maxVal, 0); + maxVal = __SADD16(maxVal, in); +} + +template<typename Input_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t& maxVal, + const int nb_data) +{ + assert(nb_data <= 4 && nb_data >= 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t)); + + maxVal = __UQSUB8(maxVal, in); + maxVal = __UQADD8(maxVal, in); +} + +template<typename Input_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t maxVal, + const int nb_data) +{ + assert(nb_data <= 4 && nb_data >= 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t)); + + maxVal = __SSUB8(maxVal, in); + maxVal = __SEL(maxVal, 0); + maxVal = __SADD8(maxVal, in); +} + +template<typename Input_T, + typename std::enable_if<(std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t& maxVal, + const int nb_data) +{ + assert(nb_data == 2 || nb_data == 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t)); + + in = (in | in << 8) & 0xFF00FF; + in = (in | in << 4) & 0xF0F0F0F; + + maxVal = __UQSUB8(maxVal, in); + maxVal = __UQADD8(maxVal, in); +} + +template<typename Input_T, + typename std::enable_if<(!std::is_unsigned<Input_T>::value + && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr> +__attribute__((always_inline)) static inline +void parallelMaxPooling (const Input_T* __restrict inputs, + uint32_t maxVal, + 
const int nb_data) +{ + assert(nb_data == 2 || nb_data == 1); + + uint32_t in = 0; + memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t)); + + in = (in | in << 8) & 0xFF00FF; + in = (in | in << 4) & 0xF0F0F0F; + in += 0x78787878; + in ^= 0x78787878; + + maxVal = __SSUB8(maxVal, in); + maxVal = __SEL(maxVal, 0); + maxVal = __SADD8(maxVal, in); +} + + +#endif \ No newline at end of file diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h new file mode 100644 index 0000000..31223f2 --- /dev/null +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h @@ -0,0 +1,356 @@ +/** + ****************************************************************************** + * @file swar_arm_acle.h + * @brief Complete ARM Non-NEON ACLE intrinsics for Cortex m7 and m4 + * + ****************************************************************************** + * @attention + * + * (C) Copyright 2021 CEA LIST. All Rights Reserved. + * Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr) + * Philippe DORE (philippe.dore@cea.fr) + * + * This file is not part of the open source version of N2D2 and is NOT under + * the CeCILL-C license. This code is the property of the CEA. It can not be + * copied or disseminated without its authorization. 
+ * + ****************************************************************************** + */ + +#ifndef _SWAR_ARM_ACLE_H +#define _SWAR_ARM_ACLE_H + +#include <cmsis_compiler.h> +#include "assert.h" +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Rotate right and perform dual extracted 8-bit to 16-bit signed addition + * @details This function rotates op2, extracts two 8-bit values from op2 (at bit positions [7:0] and [23:16]), + * sign-extend them to 16-bits each, and add the results to op1 + * @param[in] op1 Two 16-bit values in op1[15:0] and op1[31:16] + * @param[in] op2 Two 8-bit values in op2[7:0] and op2[23:16] to be sign-extended + * @param[in] ror Number of bits to rotate op2. Only 8,16 and 24 are accepted + * @returns The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] + * and op2[23:16] have been extracted and sign-extended prior to the addition + * + */ +__attribute__((always_inline)) __STATIC_INLINE +int32_t __SXTAB16_RORn (const int32_t op1, const int32_t op2, const int8_t ror) +{ + int32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("sxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) ); + return result; +} + + +/** + * @brief Rotate right, dual extract 8-bits and sign extend each to 16-bits + * @param[in] op1 Two 8-bit values in op1[7:0] and op1[23:16] to be sign-extended + * @param[in] ror Number of bits to rotate op1. 
Only 8,16 and 24 are accepted + * @returns The 8-bit values sign-extended to 16-bit values + * + */ +__attribute__((always_inline)) __STATIC_INLINE +int32_t __SXTB16_RORn (const int32_t op1, const int8_t ror) +{ + int32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Rotate right and perform dual extracted 8-bit to 16-bit zero addition + * @details This function rotates op2, extracts two 8-bit values from op2 (at bit positions [7:0] and [23:16]), + * zero-extend them to 16-bits each, and add the results to op1 + * @param[in] op1 Two 16-bit values in op1[15:0] and op1[31:16] + * @param[in] op2 Two 8-bit values in op2[7:0] and op2[23:16] to be zero-extended + * @param[in] ror Number of bits to rotate op2. Only 8,16 and 24 are accepted + * @returns The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] + * and op2[23:16] have been extracted and zero-extended prior to the addition + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTAB16_RORn (const uint32_t op1, const uint32_t op2, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("uxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) ); + return result; +} + + +/** + * @brief Rotate right, dual extract 8-bits and zero extend each to 16-bits + * @param[in] op1 Two 8-bit values in op1[7:0] and op1[23:16] to be zero-extended + * @param[in] ror Number of bits to rotate op1. 
Only 8,16 and 24 are accepted + * @returns The 8-bit values zero-extended to 16-bit values + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTB16_RORn (const uint32_t op1, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("uxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Sign extend Halfword + * @details Extends a 16-bit value to a signed 32-bit value + * @param[in] op1 op1[15:0] to be sign-extended + * @returns Register holding the sign-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __SXTH (const uint32_t op1) +{ + uint32_t result; + + __ASM volatile ("sxth %0, %1" : "=r" (result) : "r" (op1)); + return result; +} + + +/** + * @brief Zero extend Halfword + * @details Extends a 16-bit value to an unsigned 32-bit value + * @param[in] op1 op1[15:0] to be zero-extended + * @returns Register holding the zero-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTH (const uint32_t op1) +{ + uint32_t result; + + __ASM volatile ("uxth %0, %1" : "=r" (result) : "r" (op1)); + return result; +} + + +/** + * @brief Rotate right and sign extend halfword + * @param[in] op1 op1[15:0] to be sign-extended + * @param[in] ror Number of bits to rotate op1. Only 8,16 and 24 are accepted + * @returns Register holding the sign-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __SXTH_RORn (const uint32_t op1, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("sxth %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Rotate right and zero extend halfword + * @param[in] op1 op1[15:0] to be zero-extended + * @param[in] ror Number of bits to rotate op1. 
Only 8,16 and 24 are accepted + * @returns Register holding the zero-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTH_RORn (const uint32_t op1, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("uxth %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Sign extend Byte + * @details Extends a 8-bit value to a signed 32-bit value + * @param[in] op1 op1[7:0] to be sign-extended + * @returns Register holding the sign-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __SXTB (const uint32_t op1) +{ + uint32_t result; + + __ASM volatile ("sxtb %0, %1" : "=r" (result) : "r" (op1)); + return result; +} + + +/** + * @brief Zero extend Byte + * @details Extends a 8-bit value to an unsigned 32-bit value + * @param[in] op1 op1[7:0] to be zero-extended + * @returns Register holding the zero-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTB (const uint32_t op1) +{ + uint32_t result; + + __ASM volatile ("uxtb %0, %1" : "=r" (result) : "r" (op1)); + return result; +} + + +/** + * @brief Rotate right and sign extend byte + * @param[in] op1 op1[7:0] to be sign-extended + * @param[in] ror Number of bits to rotate op1. Only 8,16 and 24 are accepted + * @returns Register holding the sign-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __SXTB_RORn (const uint32_t op1, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("sxtb %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Rotate right and zero extend byte + * @param[in] op1 op1[7:0] to be zero-extended + * @param[in] ror Number of bits to rotate op1. 
Only 8,16 and 24 are accepted + * @returns Register holding the zero-extended 32-bit value + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UXTB_RORn (const uint32_t op1, const int8_t ror) +{ + uint32_t result; + + assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24)); + __ASM volatile ("uxtb %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) ); + return result; +} + + +/** + * @brief Signed Bit Field Extract + * @details Copies adjacent bits from one register into the least significant bits + * of a second register, and sign extends to 32 bits + * @param[in] op1 Value to be extracted + * @param[in] lsb Position of the least significant bit of the bit field + * @param[in] width Width of the bit field + * @returns Extracted bitfield and sign extended to 32 bits + * + */ +__attribute__((always_inline)) __STATIC_INLINE +int32_t __SBFX (const uint32_t op1, const int8_t lsb, const int8_t width) +{ + int32_t result; + + assert((lsb >= 0) && (lsb < 32) && (width >= 0) && (width < 32-lsb)); + __ASM volatile ("sbfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) ); + return result; +} + + +/** + * @brief Unsigned Bit Field Extract + * @details Copies adjacent bits from one register into the least significant bits + * of a second register, and zero extends to 32 bits + * @param[in] op1 Value to be extracted + * @param[in] lsb Position of the least significant bit of the bit field + * @param[in] width Width of the bit field + * @returns Extracted bitfield and zero extended to 32 bits + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UBFX (const uint32_t op1, const int8_t lsb, const int8_t width) +{ + uint32_t result; + + assert((lsb >= 0) && (lsb < 32) && (width >= 0) && (width < 32-lsb)); + __ASM volatile ("ubfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) ); + return result; +} + + +/** + * @brief Bit Field Insert + * @details Copies a bitfield into one register from another 
register + * It replaces width bits in op2 starting at the position lsb, + * with width bits from op1 starting at bit[0]. + * Other bits in op2 are unchanged + * @param[in] op1 Source value + * @param[in,out] op2 Destination value + * @param[in] lsb Position of the least significant bit of the bit field + * @param[in] width Width of the bit field + * @returns The register which contains op2 and the added bitfield + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __BFI (uint32_t op1, uint32_t op2, const int8_t lsb, const int8_t width) +{ + assert((lsb >= 0) && (lsb < 32) && (width >= 0) && (width < 32-lsb)); + __ASM volatile ("bfi %0, %1, %2, %3" : "+r" (op2) : "r" (op1), "i" (lsb), "i" (width), "0" (op2) ); + return op2; +} + + +/** + * @brief Signed Divide + * @details Performs a signed integer division of the value in op1 + * by the value in op2. + * @param[in] op1 Register holding the value to be divided + * @param[in] op2 Register holding the divisor + * @returns Register holding the signed result op1/op2 + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __SDIV (const uint32_t op1, const uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sdiv %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return result; +} + + +/** + * @brief Unsigned Divide + * @details Performs an unsigned integer division of the value in op1 + * by the value in op2. 
+ * @param[in] op1 Register holding the value to be divided + * @param[in] op2 Register holding the divisor + * @returns Register holding the unsigned result op1/op2 + * + */ +__attribute__((always_inline)) __STATIC_INLINE +uint32_t __UDIV (const uint32_t op1, const uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("udiv %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return result; +} + + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp index 6b1228b..9111d1c 100644 --- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp +++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp @@ -28,7 +28,7 @@ #include <cstring> #include <limits> -#include "typedefs.h" +#include "kernels/typedefs.hpp" namespace N2D2_Export { -- GitLab