diff --git a/aidge_export_cpp/kernels/add.hpp b/aidge_export_cpp/kernels/add.hpp new file mode 100644 index 0000000000000000000000000000000000000000..52b58f5a09591781357f64f27090a7e347850d35 --- /dev/null +++ b/aidge_export_cpp/kernels/add.hpp @@ -0,0 +1,99 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_ADD__ +#define __AIDGE_EXPORT_CPP_KERNELS_ADD__ + +#include "network/typedefs.hpp" +#include "kernels/activation.hpp" + + +template<int NB_ELTS, + int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void add_forward ( + Output_T* __restrict outputs, + const Input_T* __restrict inputs1, + const Input_T* __restrict inputs2) +{ + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + + // Find the highest equal dimension + int contiguousidx = SIZE_DIM_OUT -1 ; + + for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousidx = i; + } + + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousidx ] ; + int stride_post1[contiguousidx ] ; + int stride_step0[contiguousidx ] ; + int stride_step1[contiguousidx ] ; + if (contiguousidx > 0) { + stride_post0[contiguousidx - 1] = 1; + stride_post1[contiguousidx - 1] = 1; + for (int i = contiguousidx -2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousidx ; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousidx ; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + + } + int dim = contiguousidx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousidx - 1; + } + } +} + + + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_ADD__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp index 740ea21e6f66ba338985db4f724a5d57377e1f81..4100e6d1ef0d07ff16ad25b33cb497e1143f0c0b 100644 --- a/aidge_export_cpp/kernels/batchnorm.hpp +++ b/aidge_export_cpp/kernels/batchnorm.hpp @@ -3,6 +3,7 @@ #include "network/typedefs.hpp" #include "kernels/rescaling.hpp" +#include "kernels/activation.hpp" #include <math.h> // WARNING: this kernel only works for 32-bits floating point values @@ -12,30 +13,40 @@ template<int NB_OUTPUTS, ActivationFunction_T ACTIVATION, typename Input_T, typename Output_T, typename Param_T> -__attribute__((always_inline)) inline +__attribute__((always_inline)) inline void batchnorm_forward ( const Input_T* __restrict inputs, Output_T* __restrict outputs, + const Param_T* __restrict scales, const Param_T* __restrict biases, - const Param_T* __restrict variances, const Param_T* __restrict means, - const Param_T* __restrict scales, + const Param_T* __restrict variances, const double epsilon) { - for (unsigned int output = 0; output < NB_OUTPUTS; ++output) { - const Output_T var = sqrt(variances[output] + epsilon); - for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { - for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - const int outputOffset = OUTPUTS_HEIGHT * oy + ox; - - const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var; - const Output_T sAs = scales[output] * normalized + biases[output]; - outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling); - } + int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH; +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int ch = 0; ch < NB_OUTPUTS; ++ch) { + int ioIndex = ch * featureMapSize; +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int i = ioIndex; i < ioIndex + featureMapSize; i++) { + outputs[i] = biases[ch]; + } + float var = sqrt(variances[ch] + epsilon); +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int feature = 0; feature < featureMapSize; ++feature) { + outputs[ioIndex + feature] += (scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var); } } + } + #endif // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__ diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp index 6ea9f0579b84dd5a28a5ea66a778326fcd9c84ce..11553398a24351d7d6f4b7d31e5fe4e3cd0c2654 100644 --- a/aidge_export_cpp/kernels/convolution.hpp +++ b/aidge_export_cpp/kernels/convolution.hpp @@ -6,30 +6,44 @@ #include "network/utils.hpp" #include "kernels/macs.hpp" #include "kernels/activation.hpp" +#include <omp.h> +#include <iostream> + +// Weights index en NHWC +constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) { + return n * (H * W * C) + + h * (W * C) + + w * C + + c; +} +// Image index in CHW +constexpr int inds_pos(int c, int h, int w, int C, int H, int W) { + return c * (H * W) + + h * W + + w; +} -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, +template<int NB_CHANNELS, + int IN_HEIGHT, int IN_WIDTH, + int NB_OUTPUTS, int GROUPS, + int OUT_HEIGHT, int OUT_WIDTH, int PADDING_Y, int PADDING_X, int STRIDE_Y, int STRIDE_X, int DILATION_Y, int DILATION_X, int KERNEL_HEIGHT, int KERNEL_WIDTH, ActivationFunction_T ACTIVATION, - typename Input_T, typename Output_T, + typename Input_T, typename Output_T, typename Weight_T, typename Bias_T, typename Rescaling_T> -__attribute__((always_inline)) inline +__attribute__((always_inline)) inline void convolution_forward( - const Input_T* __restrict inputs, + const Input_T* __restrict inputs, Output_T* __restrict outputs, const Weight_T* __restrict weights, const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - constexpr int DILATED_KERNEL_HEIGHT - = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); constexpr int DILATED_KERNEL_WIDTH = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1); diff --git a/aidge_export_cpp/kernels/convolution_groups.hpp b/aidge_export_cpp/kernels/convolution_groups.hpp new file mode 100644 index 0000000000000000000000000000000000000000..17cb1bfa7962063c59fca2395b1110c526e68015 --- /dev/null +++ b/aidge_export_cpp/kernels/convolution_groups.hpp @@ -0,0 +1,85 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__ +#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__ + +#include "network/typedefs.hpp" +#include "kernels/rescaling.hpp" +#include "network/utils.hpp" +#include "kernels/macs.hpp" +#include "kernels/activation.hpp" + +// Weights index en NHWC +constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) { + return n * (H * W * C) + + h * (W * C) + + w * C + + c; +} + +// Image index in CHW +constexpr int inds_pos(int c, int h, int w, int C, int H, int W) { + return c * (H * W) + + h * W + + w; +} + + + +template<int NB_CHANNELS, + int IN_HEIGHT, int IN_WIDTH, + int NB_OUTPUTS, int GROUPS, + int OUT_HEIGHT, int OUT_WIDTH, + int PADDING_Y, int PADDING_X, + int STRIDE_Y, int STRIDE_X, + int DILATION_Y, int DILATION_X, + int KERNEL_HEIGHT, int KERNEL_WIDTH, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T, + typename Weight_T, typename Bias_T, + typename Rescaling_T> +__attribute__((always_inline)) inline +void convolution_forward( + const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Weight_T* __restrict weights, + const Bias_T* __restrict biases, + const Rescaling_T& __restrict rescaling) +{ + + if (NB_CHANNELS % GROUPS != 0 || NB_OUTPUTS % GROUPS != 0) { + throw std::invalid_argument("Groups must be a divisor of both NB_CHANNELS and NB_OUTPUTS!"); + } + + int c_in_g = NB_CHANNELS / GROUPS; + int c_out_g = NB_OUTPUTS / GROUPS; +#ifdef _OPENMP + #pragma omp parallel for collapse(3) +#endif + for (int oc = 0; oc < NB_OUTPUTS; oc++) { + for (int i = 0; i < OUT_HEIGHT; ++i) { + for (int j = 0; j < OUT_WIDTH; ++j) { + int g_oc = oc / c_out_g; + Output_T value = biases[oc]; + for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) { +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int m = 0; m < KERNEL_HEIGHT; ++m) { +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int n = 0; n < KERNEL_WIDTH; ++n) { + int i_p = i * STRIDE_X - PADDING_X + m * DILATION_X; + int j_p = j * STRIDE_Y - PADDING_Y + n * DILATION_Y; + if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) { + value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] * + inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)]; + } + } + } + } + outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling); + } + } + } +} +#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/div.hpp b/aidge_export_cpp/kernels/div.hpp new file mode 100644 index 0000000000000000000000000000000000000000..44640aafcf745e837bb872c56df70eab74ffb4dd --- /dev/null +++ b/aidge_export_cpp/kernels/div.hpp @@ -0,0 +1,99 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_DIV__ +#define __AIDGE_EXPORT_CPP_KERNELS_DIV__ + +#include "network/typedefs.hpp" +#include "kernels/activation.hpp" + +template<int NB_ELTS, + int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void div_forward ( + Output_T* __restrict outputs, + const Input_T* __restrict inputs1, + const Input_T* __restrict inputs2) +{ + + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + + // Find the highest equal dimension + int contiguousidx = SIZE_DIM_OUT -1 ; + + for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousidx = i; + } + + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousidx ] ; + int stride_post1[contiguousidx ] ; + int stride_step0[contiguousidx ] ; + int stride_step1[contiguousidx ] ; + if (contiguousidx > 0) { + stride_post0[contiguousidx - 1] = 1; + stride_post1[contiguousidx - 1] = 1; + for (int i = contiguousidx -2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousidx ; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousidx ; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + + } + int dim = contiguousidx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousidx - 1; + } + } +} + + + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_DIV__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp index 67ee574c1cb7d197f3c976ce80a2a63d36aec873..9b979591dbc367f9b9d3520714e4b6871804538d 100644 --- a/aidge_export_cpp/kernels/elemwise.hpp +++ b/aidge_export_cpp/kernels/elemwise.hpp @@ -4,13 +4,12 @@ #include "network/typedefs.hpp" #include "kernels/activation.hpp" -// Generic function for two inputs -template<int NB_ELTS, - ElemWise_T ELEM_OP, - ActivationFunction_T ACTIVATION, - typename Input_T, typename Output_T, - typename Rescaling_T> +template<int NB_ELTS, ElemWise_T ELEM_OP, + int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T, typename Rescaling_T> __attribute__((always_inline)) inline void elemwise_forward ( Output_T* __restrict outputs, @@ -21,32 +20,313 @@ void elemwise_forward ( if (std::is_floating_point<Input_T>::value) { Input_T val = 0; - + switch (ELEM_OP) { case Add: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] + inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + + // Find the highest equal dimension + int contiguousidx = SIZE_DIM_OUT -1 ; + for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousidx = i; + } + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousidx ] ; + int stride_post1[contiguousidx ] ; + int stride_step0[contiguousidx ] ; + int stride_step1[contiguousidx ] ; + + if (contiguousidx > 0) { + stride_post0[contiguousidx - 1] = 1; + stride_post1[contiguousidx - 1] = 1; + for (int i = contiguousidx -2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousidx ; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousidx ; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + + } + int dim = contiguousidx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousidx - 1; + } } break; + } - case Sub: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] - inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); + case Sub: { + + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + // Find the highest equal dimension + int contiguousIdx = SIZE_DIM_OUT-1; + for (int i = contiguousIdx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousIdx = i; + } + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousIdx] ; + int stride_post1[contiguousIdx] ; + int stride_step0[contiguousIdx] ; + int stride_step1[contiguousIdx] ; + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (int i = contiguousIdx-2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousIdx; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousIdx; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + } + int dim = contiguousIdx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } + break; } case Mul: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] * inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + // Find the highest equal dimension + int contiguousIdx = SIZE_DIM_OUT-1; + for (int i = contiguousIdx; ndim_a[i] == ndim_b[i]; i--) { + contiguousIdx = i; + } + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousIdx] ; + int stride_post1[contiguousIdx] ; + int stride_step0[contiguousIdx] ; + int stride_step1[contiguousIdx] ; + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (int i = contiguousIdx-2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousIdx; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousIdx; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + } + int dim = contiguousIdx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } + break; + } + case Div: { + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + // Find the highest equal dimension + int contiguousIdx = SIZE_DIM_OUT-1; + for (int i = contiguousIdx; ndim_a[i] == ndim_b[i]; i--) { + contiguousIdx = i; + } + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousIdx] ; + int stride_post1[contiguousIdx] ; + int stride_step0[contiguousIdx] ; + int stride_step1[contiguousIdx] ; + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (int i = contiguousIdx-2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousIdx; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousIdx; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + } + int dim = contiguousIdx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } break; } default: { - // Copy inputs1 in outputs for default case for (int i = 0; i < NB_ELTS; ++i) { val = inputs1[i]; outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); @@ -58,7 +338,6 @@ void elemwise_forward ( else { int32_t val = 0; - switch (ELEM_OP) { case Add: { for (int i = 0; i < NB_ELTS; ++i) { @@ -81,6 +360,13 @@ void elemwise_forward ( } break; } + case Div: { + for (int i = 0; i < NB_ELTS; ++i) { + val = inputs1[i] / inputs2[i] ; + outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); + } + break; + } default: { // Copy inputs1 in outputs for default case for (int i = 0; i < NB_ELTS; ++i) { @@ -94,78 +380,6 @@ void elemwise_forward ( } -// Generic function for multiple inputs -// Not working - -// template<ElemWise_T ELEM_OP, typename Output_T> -// __attribute__((always_inline)) inline -// Output_T elemWise (int /*pos*/, int /*ch*/) -// { -// return 0; -// } - -// template<ElemWise_T ELEM_OP, -// int NB_CHANNELS, -// // For next inputs -// int... ARGS, -// typename... INPUTS, -// // Types -// typename Input_T, typename Output_T> -// __attribute__((always_inline)) inline -// Output_T elemWise (int pos, int ch, -// const Input_T* __restrict firstInputs, -// INPUTS... inputs) -// { -// int iOffset = NB_CHANNELS * pos; - -// return firstInputs[iOffset + ch] -// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...); -// } - -// template<// For all inputs -// int NB_CHANNELS, -// int CHANNELS_HEIGHT, int CHANNELS_WIDTH, -// int NB_ELTS, -// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, -// ElemWise_T ELEM_OP, -// ActivationFunction_T ACTIVATION, -// // For next inputs -// int... ARGS, -// typename... INPUTS, -// // Types -// typename Input_T, typename Output_T, -// typename Rescaling_T> -// __attribute__((always_inline)) inline -// void elemWise_forward ( -// Output_T* __restrict outputs, -// const Rescaling_T& __restrict rescaling, -// const Input_T* __restrict firstInputs, -// INPUTS... inputs) -// { -// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) { -// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) { -// const int pos = (ox + OUTPUTS_WIDTH * oy); -// int oOffset = NB_ELTS * pos; - -// for (int ch = 0; ch < NB_ELTS; ++ch) { -// const Add_T val = elemWise<ELEM_OP, -// INPUT_NB_CHANNELS, -// INPUT_MEM_CONT_OFFSET, -// INPUT_MEM_CONT_NB_ELTS, -// INPUT_MEM_WRAP_OFFSET, -// INPUT_MEM_WRAP_NB_ELTS, -// INPUT_MEM_STRIDE, -// ARGS...>(pos, ch, firstInputs, inputs...); - -// outputs[oOffset + ch] -// = sat<Output_T>(val, ch, ACTIVATION, rescaling); -// } -// } -// } -// } - - - -#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__ +#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp new file mode 100644 index 0000000000000000000000000000000000000000..88aafe2407c4a1106b1e05acf2406a1337fa5ac6 --- /dev/null +++ b/aidge_export_cpp/kernels/erf.hpp @@ -0,0 +1,40 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__ +#define __AIDGE_EXPORT_CPP_KERNELS_ERF__ + +#include "network/typedefs.hpp" +#include <cmath> +#include <math.h> + +template<int _NB_ELTS, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void erf_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs) +{ + double a1 = 0.254829592; + double a2 = -0.284496736; + double a3 = 1.421413741; + double a4 = -1.453152027; + double a5 = 1.061405429; + double p = 0.3275911; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < _NB_ELTS; ++i) { + int sign = 1; + if (inputs[i] < 0) + sign = -1; + double abs_value = abs(inputs[i]); + + // A&S formula 7.1.26 + double t = 1.0/(1.0 + p*abs_value); + double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value); + outputs[i] = sign*y; + + } +} + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_ERF_ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp index 895ed1c21d35e7e266f788407dd7f42719607ad7..97547f806e97a8b187bbf260ef6ffaeac33cfaf6 100644 --- a/aidge_export_cpp/kernels/fullyconnected.hpp +++ b/aidge_export_cpp/kernels/fullyconnected.hpp @@ -28,7 +28,7 @@ void fullyconnected_forward ( // It is only an issue if the FC was after a flatten layer. // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1 // Solution: Add a system to check dataformat - for (int och = 0; och < NB_OUTPUTS; och++) { + /*for (int och = 0; och < NB_OUTPUTS; och++) { Bias_T weightedSum = biases[och]; @@ -42,9 +42,9 @@ void fullyconnected_forward ( } outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); - } -/* -Here the kernel to use with inputs in NHWC and weights in NHWC + }*/ + +//Here the kernel to use with inputs in NHWC and weights in NHWC #pragma omp parallel for for (int och = 0; och < NB_OUTPUTS; och++) { @@ -65,7 +65,7 @@ Here the kernel to use with inputs in NHWC and weights in NHWC outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); } -*/ + } diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp index 4500993e02cf42fb698bc9004462800bdd3f7dc4..1403a010298e32a6a5670edb5e1efeb03ef5d37d 100644 --- a/aidge_export_cpp/kernels/matmul.hpp +++ b/aidge_export_cpp/kernels/matmul.hpp @@ -6,28 +6,106 @@ // Generic function for matmul and activation -template<int M, - int K, - int N, +template<int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT, ActivationFunction_T ACTIVATION, - typename Input_T, typename Output_T, - typename Rescaling_T> + typename Input_T, typename Output_T> __attribute__((always_inline)) inline void matmul_forward ( const Input_T* __restrict inputs1, const Input_T* __restrict inputs2, - Output_T* __restrict outputs, - const Rescaling_T& __restrict rescaling) + Output_T* __restrict outputs) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - Output_T sum = Output_T(0); - for (int k = 0; k < K; ++k) { - sum += inputs1[K*m + k] * inputs2[N*k + n]; + + //initialize arrays storing broadcasted(or not) dims + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + if ( _SIZE_DIM_IN_A == 1){ + ndim_a[0] = 1; + ndim_a[1] =INPUT_A_DIMS[0]; + } + if ( _SIZE_DIM_IN_B == 1){ + ndim_b[0] =INPUT_B_DIMS[0]; + ndim_b[1] = 1; + } + + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 :INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 :INPUT_B_DIMS[i-idx]; + } + + // initialize strides to iterate through data because of broadcasting + int stride_post0[SIZE_DIM_OUT-2] ; + int stride_post1[SIZE_DIM_OUT-2] ; + int stride_step0[SIZE_DIM_OUT-2] ; + int stride_step1[SIZE_DIM_OUT-2] ; + if (SIZE_DIM_OUT > 2){ + stride_post0[SIZE_DIM_OUT - 3] = 1; + stride_post1[SIZE_DIM_OUT - 3] = 1; + for (int i = SIZE_DIM_OUT-4; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < SIZE_DIM_OUT-2; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + + } + + + // if _SIZE_DIM_IN_B == _SIZE_DIM_IN_A, then _SIZE_DIM_IN_A == SIZE_DIM_OUT == _SIZE_DIM_IN_B; + // else it will be broadcasted to the correct dims + + int nbMatrices = 1; + for(int i = SIZE_DIM_OUT -3; i>=0; --i){ + nbMatrices *= OUTPUT_DIMS[i]; + } + int dim = SIZE_DIM_OUT -3; + + + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + const int n = ndim_a[SIZE_DIM_OUT - 2]; + const int k = ndim_a[SIZE_DIM_OUT - 1]; + const int m = ndim_b[SIZE_DIM_OUT - 1]; + const int matrix0Size = n*k; + const int matrix1Size = k*m; + const int matrixOutSize = n*m; + + for(int stack = 0; stack < nbMatrices;){ + + for (int i = 0; i < n; ++i) { + + for (int j = 0; j < m; ++j) { + float sum = 0; + + for (int l = 0; l < k; ++l) { + sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]); + } + outputs[ offsetOut*matrixOutSize + i*m + j] = sum; + } + } + + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; } - outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling); + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = SIZE_DIM_OUT -3; } + } + } -#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__ +#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/mul.hpp b/aidge_export_cpp/kernels/mul.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5c1ba622ef5971dbac8979d89931dc1f4e13cce5 --- /dev/null +++ b/aidge_export_cpp/kernels/mul.hpp @@ -0,0 +1,98 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_MUL__ +#define __AIDGE_EXPORT_CPP_KERNELS_MUL__ + +#include "network/typedefs.hpp" +#include "kernels/activation.hpp" + +template<int NB_ELTS, + int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void mul_forward ( + Output_T* __restrict outputs, + const Input_T* __restrict inputs1, + const Input_T* __restrict inputs2) +{ + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + + // Find the highest equal dimension + int contiguousidx = SIZE_DIM_OUT -1 ; + + for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousidx = i; + } + + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousidx ] ; + int stride_post1[contiguousidx ] ; + int stride_step0[contiguousidx ] ; + int stride_step1[contiguousidx ] ; + if (contiguousidx > 0) { + stride_post0[contiguousidx - 1] = 1; + stride_post1[contiguousidx - 1] = 1; + for (int i = contiguousidx -2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousidx ; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousidx ; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + + } + int dim = contiguousidx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousidx - 1; + } + } +} + + + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_MUL__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp index 478b6a58aed45e2bce0ed1683ad113f9c7a8bffb..8f6de40363f5b932148b0ac6a860611cd5b0ea9c 100644 --- a/aidge_export_cpp/kernels/pooling.hpp +++ b/aidge_export_cpp/kernels/pooling.hpp @@ -7,6 +7,35 @@ #include <stdexcept> +void reorder_NCHW_NHWC_pool(const float* input, float* output, int N, int C, int H, int W, bool direct = true) { + auto nchw_index = [=](int n, int c, int h, int w) { + return ((n * C + c) * H + h) * W + w; + }; + + auto nhwc_index = [=](int n, int h, int w, int c) { + return ((n * H + h) * W + w) * C + c; + }; + #pragma omp parallel for + for (int n = 0; n < N; ++n) { + #pragma omp parallel for + for (int c = 0; c < C; ++c) { + #pragma omp parallel for + for (int h = 0; h < H; ++h) { + #pragma omp parallel for + for (int w = 0; w < W; ++w) { + if (direct) { + output[nhwc_index(n, h, w, c)] = input[nchw_index(n, c, h, w)]; + } else { + output[nchw_index(n, c, h, w)] = input[nhwc_index(n, h, w, c)]; + } + + } + } + } + } +} + + template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH, int NB_OUTPUTS, @@ -22,11 +51,16 @@ void pooling_forward( const Input_T* __restrict inputs, Output_T* __restrict outputs) { + + float inputs_ordered[NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH]; + float outputs_unordered[NB_OUTPUTS * OUTPUTS_HEIGHT * OUTPUTS_WIDTH]; + reorder_NCHW_NHWC_pool(inputs, inputs_ordered, 1, NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH, true); + constexpr int OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y; constexpr int OUTPUTS_WIDTH_NOPAD = (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X; - + #pragma omp parallel for for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - (oy * STRIDE_Y), 0); @@ -78,15 +112,15 @@ void pooling_forward( int iOffsetInRange = iOffset + output + sx * NB_CHANNELS; - if (inputs[iOffsetInRange] > maxVal) - maxVal = inputs[iOffsetInRange]; + if (inputs_ordered[iOffsetInRange] > maxVal) + maxVal = inputs_ordered[iOffsetInRange]; } } - outputs[oOffset + output] = maxVal; + outputs_unordered[oOffset + output] = maxVal; } else if (POOLING_TYPE == Average) { - int32_t sum = 0; + Input_T sum = 0; for (int sy = 0; sy < POOL_HEIGHT; ++sy) { if ((PADDING_Y != 0 @@ -109,11 +143,12 @@ void pooling_forward( } int iOffsetInRange = iOffset + output + sx * NB_CHANNELS; - sum += inputs[iOffsetInRange]; + sum = inputs_ordered[iOffsetInRange] + sum; } + } - outputs[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH)); + outputs_unordered[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH)); } else { throw std::runtime_error("The export only supports Max and Average pooling."); @@ -121,6 +156,7 @@ void pooling_forward( } } } + reorder_NCHW_NHWC_pool(outputs_unordered, outputs, 1, NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH, false); } -#endif // __AIDGE_EXPORT_CPP_KERNELS_POOLING__ +#endif // __AIDGE_EXPORT_CPP_KERNELS_POOLING__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/sub.hpp b/aidge_export_cpp/kernels/sub.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2576edc1e3ba12ed1147e2a83a4a94cd875c574d --- /dev/null +++ b/aidge_export_cpp/kernels/sub.hpp @@ -0,0 +1,100 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_SUB__ +#define __AIDGE_EXPORT_CPP_KERNELS_SUB__ + +#include "network/typedefs.hpp" +#include "kernels/activation.hpp" + + +template<int NB_ELTS, + int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[], + int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void sub_forward ( + Output_T* __restrict outputs, + const Input_T* __restrict inputs1, + const Input_T* __restrict inputs2) +{ + + int ndim_a[SIZE_DIM_OUT]; + int ndim_b[SIZE_DIM_OUT]; + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A; + ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx]; + } + for (int i= 0; i<SIZE_DIM_OUT; i++){ + int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B; + ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx]; + } + + // Find the highest equal dimension + int contiguousidx = SIZE_DIM_OUT -1 ; + + for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) { + contiguousidx = i; + } + + // Compute the highest number of contiguous data for each Tensor + int input0_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input0_contiguous_size *= ndim_a[i]; + } + int input1_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + input1_contiguous_size *= ndim_b[i]; + } + int output_contiguous_size = 1; + for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){ + output_contiguous_size *= OUTPUT_DIMS[i]; + } + // initialize strides to iterate through data because of broadcasting + int stride_post0[contiguousidx ] ; + int stride_post1[contiguousidx ] ; + int stride_step0[contiguousidx ] ; + int stride_step1[contiguousidx ] ; + if (contiguousidx > 0) { + stride_post0[contiguousidx - 1] = 1; + stride_post1[contiguousidx - 1] = 1; + for (int i = contiguousidx -2; i != -1; --i) { + stride_post0[i] = stride_post0[i+1]*ndim_a[i+1]; + stride_post1[i] = stride_post1[i+1]*ndim_b[i+1]; + } + for (int i = 0; i < contiguousidx ; ++i) { + stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + int offsetIn0 = 0; + int offsetIn1 = 0; + int offsetOut = 0; + int nbMatrices = 1; + for(int i = 0; i<contiguousidx ; ++i){ + nbMatrices *= OUTPUT_DIMS[i]; + + } + int dim = contiguousidx - 1; + for(int stack = 0; stack < nbMatrices;){ + for(int i = 0; i < output_contiguous_size; ++i){ + int in0_id = (input0_contiguous_size != 1) ? i : 0; + int in1_id = (input1_contiguous_size != 1) ? i : 0; + outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size]; + } + if (++stack < nbMatrices) { + int tmp_stack = stack; + while(tmp_stack % OUTPUT_DIMS[dim] == 0) { + tmp_stack /= OUTPUT_DIMS[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousidx - 1; + } + } +} + + + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_SUB__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/transpose_diff.hpp b/aidge_export_cpp/kernels/transpose_diff.hpp new file mode 100644 index 0000000000000000000000000000000000000000..712d9b42a9282578bfa80ef840a2f54b86081f55 --- /dev/null +++ b/aidge_export_cpp/kernels/transpose_diff.hpp @@ -0,0 +1,54 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__ +#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__ + +#include "network/typedefs.hpp" + + +using namespace std; + +template< int INPUT_DIMS[], int PERM[], int OUTPUT_DIMS[], + int SIZE_OUTPUT_DIMS, int SIZE, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +void transpose_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs) + { + + int newStrides[SIZE_OUTPUT_DIMS]; + for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){newStrides[i] = 1;} + for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) { + for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) { + newStrides[i] *= OUTPUT_DIMS[j]; + } + } + + int indices[SIZE_OUTPUT_DIMS]; + for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;} + + for (int i = 0; i < SIZE; ++i) { + int idx = 0; + for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) { + idx += indices[PERM[j]] * newStrides[j]; + } + + outputs[idx] = inputs[i]; + + + for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) { + if (indices[j] < INPUT_DIMS[j] - 1) { + indices[j]++; + break; + } + else { + indices[j] = 0; + } + } + } + + +} + + + +#endif // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__ diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py index 346928f4a84c403df2172311cede8b99fd06eebe..18089fd97af7f39bf00a62251f1fd5c47391b362 100644 --- a/aidge_export_cpp/operators.py +++ b/aidge_export_cpp/operators.py @@ -56,7 +56,7 @@ class ProducerCPP(ExportNode): super().__init__(node, mem_info) self.values = np.array(self.operator.get_output(0)) - if len(self.values.shape) == 4: # Note: export in HWC + if len(self.values.shape) == 4: # Note: export in HWC self.values = np.transpose(self.values, (0, 2, 3, 1)) def export(self, export_folder: Path): @@ -143,6 +143,24 @@ def _setup_conv2D(conv): str(ROOT / "kernels" / "rescaling.hpp") ] + +def _setup_elemwise_op(elemwise, op): + """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...).""" + + elemwise.attributes["elemwise_op"] = op + elemwise.attributes["activation"] = "Linear" + elemwise.attributes["rescaling"] = "NoScaling" + elemwise.config_template = str( + ROOT / "templates" / "configuration" / "elemwise_config.jinja") + elemwise.forward_template = str( + ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja") + elemwise.include_list = [] + elemwise.kernels_to_copy = [ + str(ROOT / "kernels" / "elemwise.hpp"), + str(ROOT / "kernels" / "activation.hpp"), + str(ROOT / "kernels" / "rescaling.hpp") + ] + @ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) class ConvCPP(ExportNodeCpp): def __init__(self, node, mem_info): @@ -150,7 +168,7 @@ class ConvCPP(ExportNodeCpp): # No padding with Conv # Use PaddedConv to add padding attribute self.attributes["padding"] = [0, 0] - + self.attributes["groups"] = 1 _setup_conv2D(self) @ExportLibCpp.register_metaop("PaddedConv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) @@ -169,25 +187,28 @@ class PaddedConvCPP(ExportNodeCpp): ).attr.stride_dims self.attributes["dilation_dims"] = n.get_operator( ).attr.dilation_dims - + self.attributes["groups"] = 1 _setup_conv2D(self) -def _setup_elemwise_op(elemwise, op): - """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...).""" +@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +class PaddedConvDepthWiseCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + # TODO find a way to retrive attr for meta op + for n in self.operator.get_micro_graph().get_nodes(): + if n.type() == "Pad2D": + self.attributes["padding"] = n.get_operator( + ).attr.begin_end_borders + if n.type() == "ConvDepthWise2D": + self.attributes["kernel_dims"] = n.get_operator( + ).attr.kernel_dims + self.attributes["stride_dims"] = n.get_operator( + ).attr.stride_dims + self.attributes["dilation_dims"] = n.get_operator( + ).attr.dilation_dims - elemwise.attributes["elemwise_op"] = op - elemwise.attributes["activation"] = "Linear" - elemwise.attributes["rescaling"] = "NoScaling" - elemwise.config_template = str( - ROOT / "templates" / "configuration" / "elemwise_config.jinja") - elemwise.forward_template = str( - ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja") - elemwise.include_list = [] - elemwise.kernels_to_copy = [ - str(ROOT / "kernels" / "elemwise.hpp"), - str(ROOT / "kernels" / "activation.hpp"), - str(ROOT / "kernels" / "rescaling.hpp") - ] + self.attributes["groups"] = self.attributes["out_chan"][0] + _setup_conv2D(self) @ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) class AddCPP(ExportNodeCpp): @@ -210,6 +231,14 @@ class MulCPP(ExportNodeCpp): _setup_elemwise_op(self, "Mul") +@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +class DivCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + + _setup_elemwise_op(self, "Div") + + def _setup_pooling(pooling): """Common code (template and kernel setup) shared across all the different pooling operator.""" @@ -302,4 +331,39 @@ class TransposeCPP(ExportNodeCpp): self.include_list = [] self.kernels_to_copy = [ str(ROOT / "kernels" / "transpose.hpp") + ] + + +@ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +class ErfCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["activation"] = "Linear" + self.attributes["rescaling"] = "NoScaling" + self.config_template = str( + ROOT / "templates" / "configuration" / "erf_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "erf_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "erf.hpp"), + str(ROOT / "kernels" / "activation.hpp"), + str(ROOT / "kernels" / "rescaling.hpp") + ] + +@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +class BatchNorm2DCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["activation"] = "Linear" + self.attributes["rescaling"] = "NoScaling" + self.config_template = str( + ROOT / "templates" / "configuration" / "batchnorm_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "batchnorm_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "batchnorm.hpp"), + str(ROOT / "kernels" / "activation.hpp"), + str(ROOT / "kernels" / "rescaling.hpp") ] \ No newline at end of file diff --git a/aidge_export_cpp/static/include/network/typedefs.hpp b/aidge_export_cpp/static/include/network/typedefs.hpp index acece91115f73a57197c8a423cd34ec37b2f2e2a..9b8360287969a8e1583a94472e1f55db2d8d5673 100644 --- a/aidge_export_cpp/static/include/network/typedefs.hpp +++ b/aidge_export_cpp/static/include/network/typedefs.hpp @@ -19,7 +19,8 @@ typedef enum { typedef enum { Add, Sub, - Mul + Mul, + Div } ElemWise_T; typedef enum { diff --git a/aidge_export_cpp/templates/configuration/add_config.jinja b/aidge_export_cpp/templates/configuration/add_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/add_config.jinja @@ -0,0 +1,25 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H +#include "kernels/rescaling.hpp" + +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +{# For layer configuration -#} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_OUT_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_ACTIVATION {{ activation }} +static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; +#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja index 701ba7c46e4727eca86fcabf3ed997cab69f4e92..3706ee654671ce1b7bfb532de64a0ee298f8b744 100644 --- a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja +++ b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja @@ -8,4 +8,4 @@ #define {{ name|upper }}_ACTIVATION {{ activation }} #define {{ name|upper }}_EPSILON {{ epsilon }} -#endif /* {{ name|upper }}_LAYER_H */ +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja index a4a2462d1f475424d799f38e52075fafb333c0d0..417f240c64b3dfe1dc204ba6344b693aa4409998 100644 --- a/aidge_export_cpp/templates/configuration/convolution_config.jinja +++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja @@ -5,6 +5,8 @@ {# For layer configuration -#} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} + +#define {{ name|upper }}_GROUPS {{ groups }} #define {{ name|upper }}_PADDING_Y {{ padding[1] }} #define {{ name|upper }}_PADDING_X {{ padding[0] }} #define {{ name|upper }}_STRIDE_Y {{ stride_dims[1] }} @@ -21,5 +23,4 @@ static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }} #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }} - #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/div_config.jinja b/aidge_export_cpp/templates/configuration/div_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/div_config.jinja @@ -0,0 +1,25 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H +#include "kernels/rescaling.hpp" + +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +{# For layer configuration -#} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_OUT_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_ACTIVATION {{ activation }} +static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; +#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/elemwise_config.jinja b/aidge_export_cpp/templates/configuration/elemwise_config.jinja index 91a0be4cc4b6fc15e8b979ecd3ca01f122ebc63d..c00ec86a33d44ea2a4c1632736378d52f85657d8 100644 --- a/aidge_export_cpp/templates/configuration/elemwise_config.jinja +++ b/aidge_export_cpp/templates/configuration/elemwise_config.jinja @@ -7,6 +7,19 @@ {% include "./_meminfo.jinja" %} {# For layer configuration -#} #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_OUT_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}} + #define {{ name|upper }}_ACTIVATION {{ activation }} #define {{ name|upper }}_ELEM_OP {{ elemwise_op }} static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; diff --git a/aidge_export_cpp/templates/configuration/erf_config.jinja b/aidge_export_cpp/templates/configuration/erf_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..b273472091c2789837f473ef71f03de6d1702473 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/erf_config.jinja @@ -0,0 +1,11 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +{# For layer configuration -#} +{# For layer configuration -#} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} + +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja index 38316f20947fa726085bf3577ead510e6c5096f3..6ef27fef712ed0417b33da062a6dfbea1b49d8f7 100644 --- a/aidge_export_cpp/templates/configuration/matmul_config.jinja +++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja @@ -1,16 +1,29 @@ {#- For name header -#} #ifndef {{ name|upper }}_LAYER_H #define {{ name|upper }}_LAYER_H +#include "kernels/rescaling.hpp" {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} {# For layer configuration -#} -#define {{ name|upper }}_M {{ in_dims[0][0] }} -#define {{ name|upper }}_K {{ in_dims[0][1] }} -#define {{ name|upper }}_N {{ in_dims[1][1] }} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +#define {{ name|upper }}_B {{ in_dims[0][0]}} +#define {{ name|upper }}_C {{ in_chan[0]}} +#define {{ name|upper }}_M {{ in_height[0]}} +#define {{ name|upper }}_K {{ in_width[0] }} +#define {{ name|upper }}_N {{ out_width[0] }} + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + #define {{ name|upper }}_ACTIVATION {{ activation }} -static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; {#- Calculate sizes #} diff --git a/aidge_export_cpp/templates/configuration/mul_config.jinja b/aidge_export_cpp/templates/configuration/mul_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/mul_config.jinja @@ -0,0 +1,25 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H +#include "kernels/rescaling.hpp" + +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +{# For layer configuration -#} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_OUT_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_ACTIVATION {{ activation }} +static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; +#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/sub_config.jinja b/aidge_export_cpp/templates/configuration/sub_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/sub_config.jinja @@ -0,0 +1,25 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H +#include "kernels/rescaling.hpp" + +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +{# For layer configuration -#} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} }; + +#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}} +#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}} +#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_OUT_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}} + +#define {{ name|upper }}_ACTIVATION {{ activation }} +static const {{ rescaling }} {{ name|upper }}_RESCALING = {}; +#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/transpose_config.jinja b/aidge_export_cpp/templates/configuration/transpose_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01e57b89ac2cbd64635db236aa85b342778c9bab --- /dev/null +++ b/aidge_export_cpp/templates/configuration/transpose_config.jinja @@ -0,0 +1,15 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +{# For layer configuration -#} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +#define {{ name|upper }}_SIZE {{out_size[0]}} +#define {{name|upper }}_SIZE_OUTPUT_DIMS {{out_dims[0]|length}} + +int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} }; +int {{name|upper}}_INPUT_DIMS[] = { {{ in_dims[0]|join(", ") }} }; +int {{name|upper}}_PERM[] = { {{ output_dims_order|join(", ") }} }; + +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/add_forward.jinja b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..3176cedfbc9c172612cae7b50dcb87c4efedfd09 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja @@ -0,0 +1,14 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +add_forward<{{name|upper}}_NB_ELTS, + {{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_OUT_SIZE, + {{name|upper}}_ACTIVATION> + ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja b/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja index 5a759b839cd0b04b3b82f8ca4cb8dd1b0201f4f7..a18e3a755a0743a33660ef92fac4edd59e464969 100644 --- a/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja @@ -6,4 +6,4 @@ batchnorm_forward<{{ out_name[0]|upper }}_NB_OUTPUTS, {{name|upper}}_ACTIVATION> ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{in_name[3]}}, {{in_name[4]}}, {{name|upper}}_EPSILON); {% include "./_save_outputs.jinja" %} -{% endfilter %} +{% endfilter %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja index 421013b9590dabe6ee0ac12f969494913414a530..98b9e0394c737afa4ed12a4424b0004b365d3049 100644 --- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja @@ -4,6 +4,7 @@ convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS, {{ in_name[0]|upper }}_IN_HEIGHT, {{ in_name[0]|upper }}_IN_WIDTH, {{ out_name[0]|upper }}_NB_OUTPUTS, + {{name|upper}}_GROUPS, {{ out_name[0]|upper }}_OUT_HEIGHT, {{ out_name[0]|upper }}_OUT_WIDTH, {{name|upper}}_PADDING_Y, @@ -16,5 +17,6 @@ convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS, {{name|upper}}_KERNEL_WIDTH, {{name|upper}}_ACTIVATION> ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING); + {% include "./_save_outputs.jinja" %} -{% endfilter %} +{% endfilter %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/div_forward.jinja b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..4b793570a01405ac7fbdac1a7e4b84c283024c8e --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja @@ -0,0 +1,14 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +div_forward<{{name|upper}}_NB_ELTS, + {{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_OUT_SIZE, + {{name|upper}}_ACTIVATION> + ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja index f60d163dcbfd6eff75e6b66c37bc5e57cf2cfca9..3ac42a524432aba9e2082e24e8a2755c99ff84cc 100644 --- a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja @@ -1,8 +1,15 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} elemwise_forward<{{name|upper}}_NB_ELTS, - {{name|upper}}_ELEM_OP, - {{name|upper}}_ACTIVATION> + {{name|upper}}_ELEM_OP, + {{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_OUT_SIZE, + {{name|upper}}_ACTIVATION> ({{out_name[0]}}, {{name|upper}}_RESCALING, {{in_name[0]}}, {{in_name[1]}}); {% include "./_save_outputs.jinja" %} {% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..9f3fbf3009c96454aceb306042358355e6b60a22 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja @@ -0,0 +1,6 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +erf_forward<{{name|upper}}_NB_ELTS> + ({{in_name[0]}}, {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja index 64b3df301794e1cb3d56170646a6b9524f18a6ab..1a634fe53b1c77f45d87193410d738a1c0c03760 100644 --- a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja @@ -1,5 +1,17 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} +<<<<<<< HEAD +matmul_forward<{{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_ACTIVATION> + ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} +======= matmul_forward<{{name|upper}}_M, {{name|upper}}_K, {{name|upper}}_N, @@ -7,3 +19,4 @@ matmul_forward<{{name|upper}}_M, ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}, {{name|upper}}_RESCALING); {% include "./_save_outputs.jinja" %} {% endfilter %} +>>>>>>> origin/dev diff --git a/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..9a7170b510b56ed75872aab5d7b08b35921352b0 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja @@ -0,0 +1,14 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +mul_forward<{{name|upper}}_NB_ELTS, + {{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_OUT_SIZE, + {{name|upper}}_ACTIVATION> + ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..51b47a8a83d62eb87ac4e5c96026d8e5458d7e65 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja @@ -0,0 +1,14 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +sub_forward<{{name|upper}}_NB_ELTS, + {{name|upper}}_INPUT_A_DIMS, + {{name|upper}}_INPUT_B_DIMS, + {{name|upper}}_OUTPUT_DIMS, + {{name|upper}}_SIZE_DIM_IN_A, + {{name|upper}}_SIZE_DIM_IN_B, + {{name|upper}}_SIZE_DIM_OUT, + {{name|upper}}_OUT_SIZE, + {{name|upper}}_ACTIVATION> + ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..2a5433c20636a22c2a2fde1f15cbc7c39b2f8123 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja @@ -0,0 +1,10 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +transpose_forward<{{ name|upper }}_INPUT_DIMS, + {{ name|upper }}_PERM, + {{ name|upper}}_OUTPUT_DIMS, + {{ name|upper}}_SIZE_OUTPUT_DIMS, + {{name|upper}}_SIZE> + ({{in_name[0]}}, {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} \ No newline at end of file