
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (26)
Showing with 1219 additions and 162 deletions
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ADD__
#define __AIDGE_EXPORT_CPP_KERNELS_ADD__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void add_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ADD__
\ No newline at end of file
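To make the broadcasting behaviour concrete, here is a minimal, hypothetical usage sketch of add_forward. The shapes ([2, 3] + [3] -> [2, 3]), the values and the use of Linear as the ActivationFunction_T value are assumptions for illustration only; the dimension arrays live at namespace scope so they can be passed as non-type template arguments, just like the *_DIMS arrays emitted by the configuration templates. On a broadcast axis the precomputed stride_step is 1 - stride_post, which effectively rewinds the input offset so the same block is re-read instead of advancing.

// Hypothetical example, not part of the export: A is [2, 3], B is [3], output is [2, 3].
int ADD_EX_A_DIMS[] = {2, 3};
int ADD_EX_B_DIMS[] = {3};
int ADD_EX_O_DIMS[] = {2, 3};

void add_example() {
    const float a[6] = {1, 2, 3, 4, 5, 6};
    const float b[3] = {10, 20, 30};
    float out[6];
    // <NB_ELTS, A_DIMS, B_DIMS, O_DIMS, SIZE_DIM_IN_A, SIZE_DIM_IN_B, SIZE_DIM_OUT, OUT_SIZE, ACTIVATION, Input_T, Output_T>
    add_forward<6, ADD_EX_A_DIMS, ADD_EX_B_DIMS, ADD_EX_O_DIMS, 2, 1, 2, 6, Linear, float, float>(out, a, b);
    // out == {11, 22, 33, 14, 25, 36}: b is broadcast over the first axis of a.
}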
......@@ -3,6 +3,7 @@
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "kernels/activation.hpp"
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
......@@ -12,30 +13,40 @@ template<int NB_OUTPUTS,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Param_T* __restrict biases,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const Param_T* __restrict variances,
const double epsilon)
{
int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int ch = 0; ch < NB_OUTPUTS; ++ch) {
int ioIndex = ch * featureMapSize;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = ioIndex; i < ioIndex + featureMapSize; i++) {
outputs[i] = biases[ch];
}
float var = sqrt(variances[ch] + epsilon);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int feature = 0; feature < featureMapSize; ++feature) {
outputs[ioIndex + feature] += (scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
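For reference, the loop above implements the usual inference-time batch normalisation, output = bias + scale * (input - mean) / sqrt(variance + epsilon), channel by channel. A standalone scalar check of that formula, with made-up values:

// Scalar sanity check of the batchnorm_forward formula (all values are made up).
#include <cmath>
#include <cstdio>

int main() {
    const float x = 2.0f, mean = 1.0f, variance = 4.0f, scale = 0.5f, bias = 0.1f;
    const double epsilon = 1.0e-5;
    const float y = bias + scale * (x - mean) / std::sqrt(variance + epsilon);
    std::printf("%f\n", y);   // prints ~0.35
    return 0;
}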
......@@ -6,30 +6,44 @@
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include <omp.h>
#include <iostream>
// Weights index in NHWC
constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
return n * (H * W * C) +
h * (W * C) +
w * C +
c;
}
// Image index in CHW
constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
return c * (H * W) +
h * W +
w;
}
template<int NB_CHANNELS,
int IN_HEIGHT, int IN_WIDTH,
int NB_OUTPUTS, int GROUPS,
int OUT_HEIGHT, int OUT_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
// Weights index in NHWC
constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
return n * (H * W * C) +
h * (W * C) +
w * C +
c;
}
// Image index in CHW
constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
return c * (H * W) +
h * W +
w;
}
template<int NB_CHANNELS,
int IN_HEIGHT, int IN_WIDTH,
int NB_OUTPUTS, int GROUPS,
int OUT_HEIGHT, int OUT_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
if (NB_CHANNELS % GROUPS != 0 || NB_OUTPUTS % GROUPS != 0) {
throw std::invalid_argument("Groups must be a divisor of both NB_CHANNELS and NB_OUTPUTS!");
}
int c_in_g = NB_CHANNELS / GROUPS;
int c_out_g = NB_OUTPUTS / GROUPS;
#ifdef _OPENMP
#pragma omp parallel for collapse(3)
#endif
for (int oc = 0; oc < NB_OUTPUTS; oc++) {
for (int i = 0; i < OUT_HEIGHT; ++i) {
for (int j = 0; j < OUT_WIDTH; ++j) {
int g_oc = oc / c_out_g;
Output_T value = biases[oc];
for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int m = 0; m < KERNEL_HEIGHT; ++m) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n = 0; n < KERNEL_WIDTH; ++n) {
int i_p = i * STRIDE_X - PADDING_X + m * DILATION_X;
int j_p = j * STRIDE_Y - PADDING_Y + n * DILATION_Y;
if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
}
}
}
}
outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
\ No newline at end of file
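The GROUPS parameter partitions the channels: each group owns NB_CHANNELS / GROUPS input channels and NB_OUTPUTS / GROUPS output channels, and an output channel only accumulates over the input channels of its own group (GROUPS == 1 is a regular convolution, GROUPS == NB_CHANNELS == NB_OUTPUTS is depthwise). A small standalone sketch of that partitioning, with made-up channel counts:

// Illustration of the group/channel partitioning used by convolution_forward (sizes made up).
#include <cstdio>

int main() {
    const int NB_CHANNELS = 4, NB_OUTPUTS = 6, GROUPS = 2;
    const int c_in_g = NB_CHANNELS / GROUPS;   // 2 input channels per group
    const int c_out_g = NB_OUTPUTS / GROUPS;   // 3 output channels per group
    for (int oc = 0; oc < NB_OUTPUTS; ++oc) {
        const int g_oc = oc / c_out_g;
        std::printf("output channel %d reads input channels [%d, %d)\n",
                    oc, g_oc * c_in_g, (g_oc + 1) * c_in_g);
    }
    return 0;
}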
#ifndef __AIDGE_EXPORT_CPP_KERNELS_DIV__
#define __AIDGE_EXPORT_CPP_KERNELS_DIV__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void div_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_DIV__
\ No newline at end of file
......@@ -4,13 +4,12 @@
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
// Generic function for two inputs
template<int NB_ELTS, ElemWise_T ELEM_OP,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void elemwise_forward (
Output_T* __restrict outputs,
......@@ -21,32 +20,313 @@ void elemwise_forward (
if (std::is_floating_point<Input_T>::value)
{
Input_T val = 0;
switch (ELEM_OP) {
case Add: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
break;
}
case Sub: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
case Mul: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
case Div: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
......@@ -58,7 +338,6 @@ void elemwise_forward (
else
{
int32_t val = 0;
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
......@@ -81,6 +360,13 @@ void elemwise_forward (
}
break;
}
case Div: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] / inputs2[i] ;
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
......@@ -94,78 +380,6 @@ void elemwise_forward (
}
// Generic function for multiple inputs
// Not working
// template<ElemWise_T ELEM_OP, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int /*pos*/, int /*ch*/)
// {
// return 0;
// }
// template<ElemWise_T ELEM_OP,
// int NB_CHANNELS,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int pos, int ch,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// int iOffset = NB_CHANNELS * pos;
// return firstInputs[iOffset + ch]
// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
// }
// template<// For all inputs
// int NB_CHANNELS,
// int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
// int NB_ELTS,
// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
// ElemWise_T ELEM_OP,
// ActivationFunction_T ACTIVATION,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T,
// typename Rescaling_T>
// __attribute__((always_inline)) inline
// void elemWise_forward (
// Output_T* __restrict outputs,
// const Rescaling_T& __restrict rescaling,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
// const int pos = (ox + OUTPUTS_WIDTH * oy);
// int oOffset = NB_ELTS * pos;
// for (int ch = 0; ch < NB_ELTS; ++ch) {
// const Add_T val = elemWise<ELEM_OP,
// INPUT_NB_CHANNELS,
// INPUT_MEM_CONT_OFFSET,
// INPUT_MEM_CONT_NB_ELTS,
// INPUT_MEM_WRAP_OFFSET,
// INPUT_MEM_WRAP_NB_ELTS,
// INPUT_MEM_STRIDE,
// ARGS...>(pos, ch, firstInputs, inputs...);
// outputs[oOffset + ch]
// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
// }
// }
// }
// }
#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
\ No newline at end of file
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
#include "network/typedefs.hpp"
#include <cmath>
#include <math.h>
template<int _NB_ELTS,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void erf_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
double a1 = 0.254829592;
double a2 = -0.284496736;
double a3 = 1.421413741;
double a4 = -1.453152027;
double a5 = 1.061405429;
double p = 0.3275911;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < _NB_ELTS; ++i) {
int sign = 1;
if (inputs[i] < 0)
sign = -1;
double abs_value = std::fabs(inputs[i]);
// A&S formula 7.1.26
double t = 1.0/(1.0 + p*abs_value);
double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
outputs[i] = sign*y;
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ERF_
\ No newline at end of file
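The polynomial is Abramowitz & Stegun formula 7.1.26, whose absolute error is on the order of 1e-7. A quick standalone comparison against std::erf (the test points are arbitrary):

// Accuracy check of the A&S 7.1.26 approximation used by erf_forward.
#include <cmath>
#include <cstdio>

int main() {
    const double p = 0.3275911, a1 = 0.254829592, a2 = -0.284496736,
                 a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
    const double xs[] = {-2.0, -0.5, 0.0, 0.5, 2.0};
    for (double x : xs) {
        const double ax = std::fabs(x);
        const double t = 1.0 / (1.0 + p * ax);
        const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1) * t * std::exp(-ax*ax);
        const double approx = (x < 0.0 ? -y : y);
        std::printf("x=% .1f  approx=% .8f  std::erf=% .8f\n", x, approx, std::erf(x));
    }
    return 0;
}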
......@@ -28,7 +28,7 @@ void fullyconnected_forward (
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
/*for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
......@@ -42,9 +42,9 @@ void fullyconnected_forward (
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}*/
//Here the kernel to use with inputs in NHWC and weights in NHWC
#pragma omp parallel for
for (int och = 0; och < NB_OUTPUTS; och++) {
......@@ -65,7 +65,7 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
}
......
......@@ -6,28 +6,106 @@
// Generic function for matmul and activation
template<int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void matmul_forward (
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2,
Output_T* __restrict outputs)
{
// initialize arrays storing the broadcast (or original) dims
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
// A 1-D input follows the matmul convention: A (k) -> (1, k), B (k) -> (k, 1);
// otherwise missing leading dims are broadcast to 1.
if (_SIZE_DIM_IN_A == 1) {
ndim_a[0] = 1;
ndim_a[1] = INPUT_A_DIMS[0];
} else {
for (int i = 0; i < SIZE_DIM_OUT; i++) {
int idx = SIZE_DIM_OUT - _SIZE_DIM_IN_A;
ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i-idx];
}
}
if (_SIZE_DIM_IN_B == 1) {
ndim_b[0] = INPUT_B_DIMS[0];
ndim_b[1] = 1;
} else {
for (int i = 0; i < SIZE_DIM_OUT; i++) {
int idx = SIZE_DIM_OUT - _SIZE_DIM_IN_B;
ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i-idx];
}
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[SIZE_DIM_OUT-2] ;
int stride_post1[SIZE_DIM_OUT-2] ;
int stride_step0[SIZE_DIM_OUT-2] ;
int stride_step1[SIZE_DIM_OUT-2] ;
if (SIZE_DIM_OUT > 2){
stride_post0[SIZE_DIM_OUT - 3] = 1;
stride_post1[SIZE_DIM_OUT - 3] = 1;
for (int i = SIZE_DIM_OUT-4; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < SIZE_DIM_OUT-2; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// if _SIZE_DIM_IN_B == _SIZE_DIM_IN_A, then _SIZE_DIM_IN_A == SIZE_DIM_OUT == _SIZE_DIM_IN_B;
// else it will be broadcasted to the correct dims
int nbMatrices = 1;
for(int i = SIZE_DIM_OUT -3; i>=0; --i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = SIZE_DIM_OUT -3;
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
const int n = ndim_a[SIZE_DIM_OUT - 2];
const int k = ndim_a[SIZE_DIM_OUT - 1];
const int m = ndim_b[SIZE_DIM_OUT - 1];
const int matrix0Size = n*k;
const int matrix1Size = k*m;
const int matrixOutSize = n*m;
for(int stack = 0; stack < nbMatrices;){
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
float sum = 0;
for (int l = 0; l < k; ++l) {
sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]);
}
outputs[ offsetOut*matrixOutSize + i*m + j] = sum;
}
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = SIZE_DIM_OUT -3;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
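A minimal, hypothetical instantiation of the new matmul_forward for the plain 2-D case (shapes and values made up; Linear is assumed to be the identity ActivationFunction_T value and is unused by the kernel body). With SIZE_DIM_OUT == 2 there is a single stack, so the kernel reduces to one row-major matrix product:

// Hypothetical 2x3 @ 3x2 product, for illustration only.
int MM_EX_A_DIMS[] = {2, 3};
int MM_EX_B_DIMS[] = {3, 2};
int MM_EX_O_DIMS[] = {2, 2};

void matmul_example() {
    const float a[6] = {1, 2, 3, 4, 5, 6};   // [[1,2,3],[4,5,6]]
    const float b[6] = {1, 0, 0, 1, 1, 1};   // [[1,0],[0,1],[1,1]]
    float out[4];
    // <A_DIMS, B_DIMS, O_DIMS, _SIZE_DIM_IN_A, _SIZE_DIM_IN_B, SIZE_DIM_OUT, ACTIVATION, Input_T, Output_T>
    matmul_forward<MM_EX_A_DIMS, MM_EX_B_DIMS, MM_EX_O_DIMS, 2, 2, 2, Linear, float, float>(a, b, out);
    // out == {4, 5, 10, 11}, i.e. [[4,5],[10,11]]
}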
#ifndef __AIDGE_EXPORT_CPP_KERNELS_MUL__
#define __AIDGE_EXPORT_CPP_KERNELS_MUL__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void mul_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MUL__
\ No newline at end of file
......@@ -7,6 +7,35 @@
#include <stdexcept>
void reorder_NCHW_NHWC_pool(const float* input, float* output, int N, int C, int H, int W, bool direct = true) {
auto nchw_index = [=](int n, int c, int h, int w) {
return ((n * C + c) * H + h) * W + w;
};
auto nhwc_index = [=](int n, int h, int w, int c) {
return ((n * H + h) * W + w) * C + c;
};
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
#pragma omp parallel for
for (int c = 0; c < C; ++c) {
#pragma omp parallel for
for (int h = 0; h < H; ++h) {
#pragma omp parallel for
for (int w = 0; w < W; ++w) {
if (direct) {
output[nhwc_index(n, h, w, c)] = input[nchw_index(n, c, h, w)];
} else {
output[nchw_index(n, c, h, w)] = input[nhwc_index(n, h, w, c)];
}
}
}
}
}
}
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
......@@ -22,11 +51,16 @@ void pooling_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
float inputs_ordered[NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH];
float outputs_unordered[NB_OUTPUTS * OUTPUTS_HEIGHT * OUTPUTS_WIDTH];
reorder_NCHW_NHWC_pool(inputs, inputs_ordered, 1, NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH, true);
constexpr int OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X;
#pragma omp parallel for
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
......@@ -78,15 +112,15 @@ void pooling_forward(
int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
if (inputs_ordered[iOffsetInRange] > maxVal)
maxVal = inputs_ordered[iOffsetInRange];
}
}
outputs_unordered[oOffset + output] = maxVal;
}
else if (POOLING_TYPE == Average) {
Input_T sum = 0;
for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
......@@ -109,11 +143,12 @@ void pooling_forward(
}
int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
sum = inputs_ordered[iOffsetInRange] + sum;
}
}
outputs_unordered[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH));
}
else {
throw std::runtime_error("The export only supports Max and Average pooling.");
......@@ -121,6 +156,7 @@ void pooling_forward(
}
}
}
reorder_NCHW_NHWC_pool(outputs_unordered, outputs, 1, NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH, false);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_POOLING__
\ No newline at end of file
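The reorder helper converts the NCHW input into the NHWC indexing this pooling kernel expects and converts the result back; note that inputs_ordered and outputs_unordered are per-call stack buffers, so the layer temporarily holds a second copy of its activations. A standalone sketch of the index mapping it relies on (sizes made up):

// Index mapping used by reorder_NCHW_NHWC_pool, on made-up sizes (N=1, C=2, H=2, W=3).
#include <cstdio>

int main() {
    const int C = 2, H = 2, W = 3;
    auto nchw_index = [=](int n, int c, int h, int w) { return ((n * C + c) * H + h) * W + w; };
    auto nhwc_index = [=](int n, int h, int w, int c) { return ((n * H + h) * W + w) * C + c; };
    // The element (c=1, h=0, w=2) sits at offset 8 in NCHW and offset 5 in NHWC.
    std::printf("NCHW offset %d  <->  NHWC offset %d\n",
                nchw_index(0, 1, 0, 2), nhwc_index(0, 0, 2, 1));
    return 0;
}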
#ifndef __AIDGE_EXPORT_CPP_KERNELS_SUB__
#define __AIDGE_EXPORT_CPP_KERNELS_SUB__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void sub_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_SUB__
\ No newline at end of file
#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
#include "network/typedefs.hpp"
using namespace std;
template< int INPUT_DIMS[], int PERM[], int OUTPUT_DIMS[],
int SIZE_OUTPUT_DIMS, int SIZE,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void transpose_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
int newStrides[SIZE_OUTPUT_DIMS];
for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){newStrides[i] = 1;}
for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) {
for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) {
newStrides[i] *= OUTPUT_DIMS[j];
}
}
int indices[SIZE_OUTPUT_DIMS];
for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
for (int i = 0; i < SIZE; ++i) {
int idx = 0;
for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
idx += indices[PERM[j]] * newStrides[j];
}
outputs[idx] = inputs[i];
for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
if (indices[j] < INPUT_DIMS[j] - 1) {
indices[j]++;
break;
}
else {
indices[j] = 0;
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
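A small, hypothetical instantiation of transpose_forward, permuting a [2, 3] tensor into [3, 2] (values made up):

// Hypothetical 2x3 -> 3x2 transpose, for illustration only.
int TR_EX_IN_DIMS[]  = {2, 3};
int TR_EX_PERM[]     = {1, 0};
int TR_EX_OUT_DIMS[] = {3, 2};

void transpose_example() {
    const float in[6] = {1, 2, 3, 4, 5, 6};   // [[1,2,3],[4,5,6]]
    float out[6];
    // <INPUT_DIMS, PERM, OUTPUT_DIMS, SIZE_OUTPUT_DIMS, SIZE, Input_T, Output_T>
    transpose_forward<TR_EX_IN_DIMS, TR_EX_PERM, TR_EX_OUT_DIMS, 2, 6, float, float>(in, out);
    // out == {1, 4, 2, 5, 3, 6}, i.e. [[1,4],[2,5],[3,6]]
}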
......@@ -56,7 +56,7 @@ class ProducerCPP(ExportNode):
super().__init__(node, mem_info)
self.values = np.array(self.operator.get_output(0))
if len(self.values.shape) == 4:  # Note: export in HWC
self.values = np.transpose(self.values, (0, 2, 3, 1))
def export(self, export_folder: Path):
......@@ -143,6 +143,24 @@ def _setup_conv2D(conv):
str(ROOT / "kernels" / "rescaling.hpp")
]
def _setup_elemwise_op(elemwise, op):
"""Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
elemwise.attributes["elemwise_op"] = op
elemwise.attributes["activation"] = "Linear"
elemwise.attributes["rescaling"] = "NoScaling"
elemwise.config_template = str(
ROOT / "templates" / "configuration" / "elemwise_config.jinja")
elemwise.forward_template = str(
ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
elemwise.include_list = []
elemwise.kernels_to_copy = [
str(ROOT / "kernels" / "elemwise.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
@ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class ConvCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
......@@ -150,7 +168,7 @@ class ConvCPP(ExportNodeCpp):
# No padding with Conv
# Use PaddedConv to add padding attribute
self.attributes["padding"] = [0, 0]
self.attributes["groups"] = 1
_setup_conv2D(self)
@ExportLibCpp.register_metaop("PaddedConv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
......@@ -169,25 +187,28 @@ class PaddedConvCPP(ExportNodeCpp):
).attr.stride_dims
self.attributes["dilation_dims"] = n.get_operator(
).attr.dilation_dims
self.attributes["groups"] = 1
_setup_conv2D(self)
@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class PaddedConvDepthWiseCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
# TODO find a way to retrieve attr for meta op
for n in self.operator.get_micro_graph().get_nodes():
if n.type() == "Pad2D":
self.attributes["padding"] = n.get_operator(
).attr.begin_end_borders
if n.type() == "ConvDepthWise2D":
self.attributes["kernel_dims"] = n.get_operator(
).attr.kernel_dims
self.attributes["stride_dims"] = n.get_operator(
).attr.stride_dims
self.attributes["dilation_dims"] = n.get_operator(
).attr.dilation_dims
elemwise.attributes["elemwise_op"] = op
elemwise.attributes["activation"] = "Linear"
elemwise.attributes["rescaling"] = "NoScaling"
elemwise.config_template = str(
ROOT / "templates" / "configuration" / "elemwise_config.jinja")
elemwise.forward_template = str(
ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
elemwise.include_list = []
elemwise.kernels_to_copy = [
str(ROOT / "kernels" / "elemwise.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
self.attributes["groups"] = self.attributes["out_chan"][0]
_setup_conv2D(self)
@ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class AddCPP(ExportNodeCpp):
......@@ -210,6 +231,14 @@ class MulCPP(ExportNodeCpp):
_setup_elemwise_op(self, "Mul")
@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class DivCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
_setup_elemwise_op(self, "Div")
def _setup_pooling(pooling):
"""Common code (template and kernel setup) shared across all the different pooling operator."""
......@@ -302,4 +331,39 @@ class TransposeCPP(ExportNodeCpp):
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "transpose.hpp")
]
@ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class ErfCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
self.attributes["activation"] = "Linear"
self.attributes["rescaling"] = "NoScaling"
self.config_template = str(
ROOT / "templates" / "configuration" / "erf_config.jinja")
self.forward_template = str(
ROOT / "templates" / "kernel_forward" / "erf_forward.jinja")
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "erf.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class BatchNorm2DCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
self.attributes["activation"] = "Linear"
self.attributes["rescaling"] = "NoScaling"
self.config_template = str(
ROOT / "templates" / "configuration" / "batchnorm_config.jinja")
self.forward_template = str(
ROOT / "templates" / "kernel_forward" / "batchnorm_forward.jinja")
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "batchnorm.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
\ No newline at end of file
......@@ -19,7 +19,8 @@ typedef enum {
typedef enum {
Add,
Sub,
Mul,
Div
} ElemWise_T;
typedef enum {
......
{#- For name header -#}
#ifndef {{ name|upper }}_LAYER_H
#define {{ name|upper }}_LAYER_H
#include "kernels/rescaling.hpp"
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#endif /* {{ name|upper }}_LAYER_H */
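For reference, this template would render to something like the following header for a hypothetical node named "add0" with input A of shape [2, 3], input B of shape [3] and output [2, 3] (the _def_io.jinja and _meminfo.jinja parts are omitted; Linear and NoScaling are the defaults set by the Python export nodes):

#ifndef ADD0_LAYER_H
#define ADD0_LAYER_H
#include "kernels/rescaling.hpp"

#define ADD0_NB_ELTS 2*3
#define ADD0_NB_ELTS_B 3
int ADD0_OUTPUT_DIMS[] = { 2, 3 };
int ADD0_INPUT_A_DIMS[] = { 2, 3 };
int ADD0_INPUT_B_DIMS[] = { 3 };
#define ADD0_SIZE_DIM_IN_A 2
#define ADD0_SIZE_DIM_IN_B 1
#define ADD0_SIZE_DIM_OUT 2
#define ADD0_OUT_SIZE 6
#define ADD0_ACTIVATION Linear
static const NoScaling ADD0_RESCALING = {};
#endif /* ADD0_LAYER_H */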
......@@ -8,4 +8,4 @@
#define {{ name|upper }}_ACTIVATION {{ activation }}
#define {{ name|upper }}_EPSILON {{ epsilon }}
#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
......@@ -5,6 +5,8 @@
{# For layer configuration -#}
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
#define {{ name|upper }}_GROUPS {{ groups }}
#define {{ name|upper }}_PADDING_Y {{ padding[1] }}
#define {{ name|upper }}_PADDING_X {{ padding[0] }}
#define {{ name|upper }}_STRIDE_Y {{ stride_dims[1] }}
......@@ -21,5 +23,4 @@ static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
#define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
#endif /* {{ name|upper }}_LAYER_H */
{#- For name header -#}
#ifndef {{ name|upper }}_LAYER_H
#define {{ name|upper }}_LAYER_H
#include "kernels/rescaling.hpp"
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#endif /* {{ name|upper }}_LAYER_H */
......@@ -7,6 +7,19 @@
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
#define {{ name|upper }}_ELEM_OP {{ elemwise_op }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
......