diff --git a/aidge_export_cpp/kernels/add.hpp b/aidge_export_cpp/kernels/add.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..52b58f5a09591781357f64f27090a7e347850d35
--- /dev/null
+++ b/aidge_export_cpp/kernels/add.hpp
@@ -0,0 +1,99 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ADD__
+#define __AIDGE_EXPORT_CPP_KERNELS_ADD__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+
+
+template<int NB_ELTS,
+         int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void add_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+        ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+    }
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+        ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+    }
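+    // Dimensions are right-aligned and left-padded with 1s (NumPy-style
+    // broadcasting). Example (illustrative): A dims [3, 4] with an output
+    // rank of 3 give ndim_a = [1, 3, 4]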
+    
+    // Find the first dimension from which all trailing dims of A and B match
+    int contiguousidx = SIZE_DIM_OUT - 1;
+
+    for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
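+    // stride_post[i]: number of elements covered by dims i+1.. of the input;
+    // stride_step[i]: offset increment when output dim i advances; on a
+    // broadcast dim (size 1) it rewinds so the same data is read again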
+    // Sized with the compile-time constant SIZE_DIM_OUT (>= contiguousidx)
+    // to avoid variable-length arrays, which are not standard C++
+    int stride_post0[SIZE_DIM_OUT];
+    int stride_post1[SIZE_DIM_OUT];
+    int stride_step0[SIZE_DIM_OUT];
+    int stride_step1[SIZE_DIM_OUT];
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx - 1] = 1;
+        stride_post1[contiguousidx - 1] = 1;
+        for (int i = contiguousidx - 2; i >= 0; --i) {
+            stride_post0[i] = stride_post0[i+1] * ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1] * ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for (int i = 0; i < contiguousidx; ++i) {
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = contiguousidx - 1;
+    for (int stack = 0; stack < nbMatrices;) {
+        for (int i = 0; i < output_contiguous_size; ++i) {
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] =
+                inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                + inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while (tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ADD__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp
index 740ea21e6f66ba338985db4f724a5d57377e1f81..4100e6d1ef0d07ff16ad25b33cb497e1143f0c0b 100644
--- a/aidge_export_cpp/kernels/batchnorm.hpp
+++ b/aidge_export_cpp/kernels/batchnorm.hpp
@@ -3,6 +3,7 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/rescaling.hpp"
+#include "kernels/activation.hpp"
 #include <math.h>
 
 // WARNING: this kernel only works for 32-bits floating point values
@@ -12,30 +13,40 @@ template<int NB_OUTPUTS,
          ActivationFunction_T ACTIVATION,
          typename Input_T, typename Output_T,
          typename Param_T>
 __attribute__((always_inline)) inline
 void batchnorm_forward (
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs,
+    const Param_T* __restrict scales,
     const Param_T* __restrict biases,
-    const Param_T* __restrict variances,
     const Param_T* __restrict means,
-    const Param_T* __restrict scales,
+    const Param_T* __restrict variances,
     const double epsilon)
 {
-    for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
-        const Output_T var = sqrt(variances[output] + epsilon);
 
-        for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-            for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-                const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
-
-                const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
-                const Output_T sAs = scales[output] * normalized + biases[output];
-                outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
-            }
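+    // NOTE: unlike the previous version, no activation is applied below;
+    // ACTIVATION is assumed to be Linear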
+    const int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH;
+#ifdef _OPENMP
+    // Parallelize across channels only: nested parallel regions are usually
+    // serialized and only add scheduling overhead
+    #pragma omp parallel for
+#endif
+    for (int ch = 0; ch < NB_OUTPUTS; ++ch) {
+        const int ioIndex = ch * featureMapSize;
+        const Output_T var = sqrt(variances[ch] + epsilon);
+        // Single fused pass: out = bias + scale * (in - mean) / sqrt(var + eps)
+        for (int feature = 0; feature < featureMapSize; ++feature) {
+            outputs[ioIndex + feature] = biases[ch]
+                + scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var;
         }
     }
+
 }
 
 
+
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 6ea9f0579b84dd5a28a5ea66a778326fcd9c84ce..11553398a24351d7d6f4b7d31e5fe4e3cd0c2654 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -6,30 +6,44 @@
 #include "network/utils.hpp"
 #include "kernels/macs.hpp"
 #include "kernels/activation.hpp"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// Weights index in NHWC
+constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
+    return n * (H * W * C) +
+           h * (W * C) + 
+           w * C +
+           c;
+}
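+// Example (illustrative): with weights of shape [N=8, H=3, W=3, C=4],
+// inds_pos(1, 2, 0, 0, 8, 4, 3, 3) = 1*(3*3*4) + 0 + 0 + 2 = 38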
 
+// Image index in CHW
+constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
+    return c * (H * W) + 
+           h * W +
+           w;
+}
 
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+template<int NB_CHANNELS,
+         int IN_HEIGHT, int IN_WIDTH,
+         int NB_OUTPUTS, int GROUPS,
+         int OUT_HEIGHT, int OUT_WIDTH,
          int PADDING_Y, int PADDING_X,
          int STRIDE_Y, int STRIDE_X,
          int DILATION_Y, int DILATION_X,
          int KERNEL_HEIGHT, int KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
          typename Input_T, typename Output_T,
          typename Weight_T, typename Bias_T,
          typename Rescaling_T>
 __attribute__((always_inline)) inline
 void convolution_forward(
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs,
     const Weight_T* __restrict weights,
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr int DILATED_KERNEL_HEIGHT
-            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
 
     constexpr int DILATED_KERNEL_WIDTH
             = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
diff --git a/aidge_export_cpp/kernels/convolution_groups.hpp b/aidge_export_cpp/kernels/convolution_groups.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..17cb1bfa7962063c59fca2395b1110c526e68015
--- /dev/null
+++ b/aidge_export_cpp/kernels/convolution_groups.hpp
@@ -0,0 +1,85 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_GROUPS__
+#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_GROUPS__
+
+#include "network/typedefs.hpp"
+#include "kernels/rescaling.hpp"
+#include "network/utils.hpp"
+#include "kernels/macs.hpp"
+#include "kernels/activation.hpp"
+
+// Weights index in NHWC
+constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
+    return n * (H * W * C) +
+           h * (W * C) + 
+           w * C +
+           c;
+}
+
+// Image index in CHW
+constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
+    return c * (H * W) + 
+           h * W +
+           w;
+}
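+
+// NOTE: these index helpers duplicate the ones added to convolution.hpp;
+// the two headers are assumed not to be included in the same translation
+// unit, otherwise the duplicated definitions would clash.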
+
+
+
+template<int NB_CHANNELS, 
+         int IN_HEIGHT, int IN_WIDTH,
+         int NB_OUTPUTS, int GROUPS,
+         int OUT_HEIGHT, int OUT_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T, 
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline 
+void convolution_forward(
+    const Input_T* __restrict inputs,  
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+
+    // GROUPS, NB_CHANNELS and NB_OUTPUTS are compile-time constants, so the
+    // divisibility requirement can be checked statically (this also avoids
+    // throwing without including <stdexcept>)
+    static_assert(NB_CHANNELS % GROUPS == 0 && NB_OUTPUTS % GROUPS == 0,
+                  "GROUPS must be a divisor of both NB_CHANNELS and NB_OUTPUTS!");
+
+    constexpr int c_in_g = NB_CHANNELS / GROUPS;
+    constexpr int c_out_g = NB_OUTPUTS / GROUPS;
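+    // Output channel oc belongs to group g_oc = oc / c_out_g and only sees
+    // input channels [g_oc * c_in_g, (g_oc + 1) * c_in_g). With
+    // GROUPS == NB_CHANNELS == NB_OUTPUTS this is a depthwise convolution.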
+#ifdef _OPENMP
+    // Parallelize the three outer loops only: nested parallel regions are
+    // usually serialized and would only add scheduling overhead
+    #pragma omp parallel for collapse(3)
+#endif
+    for (int oc = 0; oc < NB_OUTPUTS; oc++) {
+        for (int i = 0; i < OUT_HEIGHT; ++i) {
+            for (int j = 0; j < OUT_WIDTH; ++j) {
+                int g_oc = oc / c_out_g;
+                Output_T value = biases[oc];
+                for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
+                    for (int m = 0; m < KERNEL_HEIGHT; ++m) {
+                        for (int n = 0; n < KERNEL_WIDTH; ++n) {
+                            // i and m walk the height axis; j and n the width axis
+                            int i_p = i * STRIDE_Y - PADDING_Y + m * DILATION_Y;
+                            int j_p = j * STRIDE_X - PADDING_X + n * DILATION_X;
+                            if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
+                                value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
+                                         inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
+                            }
+                        }
+                    }
+                }
+                outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
+            }
+        }
+    }
+}
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_GROUPS__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/div.hpp b/aidge_export_cpp/kernels/div.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..44640aafcf745e837bb872c56df70eab74ffb4dd
--- /dev/null
+++ b/aidge_export_cpp/kernels/div.hpp
@@ -0,0 +1,99 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_DIV__
+#define __AIDGE_EXPORT_CPP_KERNELS_DIV__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+
+template<int NB_ELTS,
+         int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void div_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+        ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+    }
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+        ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+    }
+    
+    // Find the first dimension from which all trailing dims of A and B match
+    int contiguousidx = SIZE_DIM_OUT - 1;
+
+    for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[SIZE_DIM_OUT];
+    int stride_post1[SIZE_DIM_OUT];
+    int stride_step0[SIZE_DIM_OUT];
+    int stride_step1[SIZE_DIM_OUT];
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx - 1] = 1;
+        stride_post1[contiguousidx - 1] = 1;
+        for (int i = contiguousidx - 2; i >= 0; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for (int i = 0; i < contiguousidx; ++i) {
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = contiguousidx - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] =
+                inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                / inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_DIV__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp
index 67ee574c1cb7d197f3c976ce80a2a63d36aec873..9b979591dbc367f9b9d3520714e4b6871804538d 100644
--- a/aidge_export_cpp/kernels/elemwise.hpp
+++ b/aidge_export_cpp/kernels/elemwise.hpp
@@ -4,13 +4,12 @@
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
 
-// Generic function for two inputs
 
-template<int NB_ELTS,
-         ElemWise_T ELEM_OP,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+template<int NB_ELTS, ElemWise_T ELEM_OP,
+         int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T, typename Rescaling_T>
 __attribute__((always_inline)) inline
 void elemwise_forward (
     Output_T* __restrict outputs,
@@ -21,32 +20,313 @@ void elemwise_forward (
     if (std::is_floating_point<Input_T>::value)
     {
         Input_T val = 0;
-
+        
         switch (ELEM_OP) {
             case Add: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] + inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
+                int ndim_a[SIZE_DIM_OUT];
+                int ndim_b[SIZE_DIM_OUT];
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+                    ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+                }
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+                    ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+                }
+                
+                // Find the first dimension from which all trailing dims of A and B match
+                int contiguousidx = SIZE_DIM_OUT - 1;
+                for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+                    contiguousidx = i;
+                }
+                // Compute the size of the contiguous data block of each tensor
+                int input0_contiguous_size = 1;
+                for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+                    input0_contiguous_size *= ndim_a[i];
+                }
+                int input1_contiguous_size = 1;
+                for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+                    input1_contiguous_size *= ndim_b[i];
+                }
+                int output_contiguous_size = 1;
+                for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+                    output_contiguous_size *= OUTPUT_DIMS[i];
+                }
+                // initialize strides to iterate through data because of broadcasting
+                int stride_post0[SIZE_DIM_OUT];
+                int stride_post1[SIZE_DIM_OUT];
+                int stride_step0[SIZE_DIM_OUT];
+                int stride_step1[SIZE_DIM_OUT];
+
+                if (contiguousidx > 0) {
+                    stride_post0[contiguousidx - 1] = 1;
+                    stride_post1[contiguousidx - 1] = 1;
+                    for (int i = contiguousidx - 2; i >= 0; --i) {
+                        stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+                        stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+                    }
+                    for (int i = 0; i < contiguousidx ; ++i) {
+                        stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+                        stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+                    }
+                }
+                int offsetIn0 = 0;
+                int offsetIn1 = 0;
+                int offsetOut = 0;
+                int nbMatrices = 1;
+                for(int i = 0; i<contiguousidx ; ++i){
+                    nbMatrices *= OUTPUT_DIMS[i];
+                    
+                }
+                int dim = contiguousidx  - 1;
+                for(int stack = 0; stack < nbMatrices;){
+                    for(int i = 0; i < output_contiguous_size; ++i){
+                        int in0_id = (input0_contiguous_size != 1) ? i : 0;
+                        int in1_id = (input1_contiguous_size != 1) ? i : 0;
+                        outputs[i + offsetOut*output_contiguous_size] =
+                            inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                            + inputs2[in1_id + offsetIn1*input1_contiguous_size];
+                    }
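+                    // NOTE: unlike the integer path below, the broadcast
+                    // implementations do not apply ACTIVATION / rescaling
+                    // (assumed Linear / NoScaling)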
+                    if (++stack < nbMatrices) {
+                        int tmp_stack = stack;
+                        while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                            tmp_stack /= OUTPUT_DIMS[dim];
+                            dim--;
+                        }
+                        offsetIn0 += stride_step0[dim];
+                        offsetIn1 += stride_step1[dim];
+                        ++offsetOut;
+                        dim = contiguousidx  - 1;
+                    }
                 }
                 break;
+
             }
-            case Sub: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] - inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
+            case Sub: {
+
+                int ndim_a[SIZE_DIM_OUT];
+                int ndim_b[SIZE_DIM_OUT];
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+                    ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+                }
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+                    ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+                }
+                // Find the highest equal dimension
+                int contiguousIdx = SIZE_DIM_OUT-1;
 
+                for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+                    contiguousIdx = i;
+                }
+                // Compute the highest number of contiguous data for each Tensor
+                int input0_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input0_contiguous_size *= ndim_a[i];
+                }
+                int input1_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input1_contiguous_size *= ndim_b[i];
+                }
+                int output_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    output_contiguous_size *= OUTPUT_DIMS[i];
                 }
+                // initialize strides to iterate through data because of broadcasting
+                int stride_post0[SIZE_DIM_OUT];
+                int stride_post1[SIZE_DIM_OUT];
+                int stride_step0[SIZE_DIM_OUT];
+                int stride_step1[SIZE_DIM_OUT];
+                if (contiguousIdx > 0) {
+                    stride_post0[contiguousIdx - 1] = 1;
+                    stride_post1[contiguousIdx - 1] = 1;
+                    for (int i = contiguousIdx - 2; i >= 0; --i) {
+                        stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+                        stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+                    }
+                    for (int i = 0; i < contiguousIdx; ++i) {
+                        stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+                        stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+                    }
+                }
+                int offsetIn0 = 0;
+                int offsetIn1 = 0;
+                int offsetOut = 0;
+                int nbMatrices = 1;
+                for(int i = 0; i<contiguousIdx; ++i){
+                    nbMatrices *= OUTPUT_DIMS[i];
+                }
+                int dim = contiguousIdx - 1;
+                for(int stack = 0; stack < nbMatrices;){
+                    for(int i = 0; i < output_contiguous_size; ++i){
+                        int in0_id = (input0_contiguous_size != 1) ? i : 0;
+                        int in1_id = (input1_contiguous_size != 1) ? i : 0;
+                        outputs[i + offsetOut*output_contiguous_size] =
+                            inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                            - inputs2[in1_id + offsetIn1*input1_contiguous_size];
+                    }
+                    if (++stack < nbMatrices) {
+                        int tmp_stack = stack;
+                        while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                            tmp_stack /= OUTPUT_DIMS[dim];
+                            dim--;
+                        }
+                        offsetIn0 += stride_step0[dim];
+                        offsetIn1 += stride_step1[dim];
+                        ++offsetOut;
+                        dim = contiguousIdx - 1;
+                    }
+                }
+                
                 break;
             }
             case Mul: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] * inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
+                int ndim_a[SIZE_DIM_OUT];
+                int ndim_b[SIZE_DIM_OUT];
+
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+                    ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+                }
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+                    ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+                }
+                // Find the first dimension from which all trailing dims of A and B match
+                int contiguousIdx = SIZE_DIM_OUT - 1;
+                for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+                    contiguousIdx = i;
+                }
+                // Compute the highest number of contiguous data for each Tensor
+                int input0_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input0_contiguous_size *= ndim_a[i];
+                }
+                int input1_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input1_contiguous_size *= ndim_b[i];
+                }
+                int output_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    output_contiguous_size *= OUTPUT_DIMS[i];
+                }
+                // initialize strides to iterate through data because of broadcasting
+                int stride_post0[SIZE_DIM_OUT];
+                int stride_post1[SIZE_DIM_OUT];
+                int stride_step0[SIZE_DIM_OUT];
+                int stride_step1[SIZE_DIM_OUT];
+                if (contiguousIdx > 0) {
+                    stride_post0[contiguousIdx - 1] = 1;
+                    stride_post1[contiguousIdx - 1] = 1;
+                    for (int i = contiguousIdx - 2; i >= 0; --i) {
+                        stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+                        stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+                    }
+                    for (int i = 0; i < contiguousIdx; ++i) {
+                        stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+                        stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+                    }
+                }
+                int offsetIn0 = 0;
+                int offsetIn1 = 0;
+                int offsetOut = 0;
+                int nbMatrices = 1;
+                for(int i = 0; i<contiguousIdx; ++i){
+                    nbMatrices *= OUTPUT_DIMS[i];
+                }
+                int dim = contiguousIdx - 1;
+                for(int stack = 0; stack < nbMatrices;){
+                    for(int i = 0; i < output_contiguous_size; ++i){
+                        int in0_id = (input0_contiguous_size != 1) ? i : 0;
+                        int in1_id = (input1_contiguous_size != 1) ? i : 0;
+                        outputs[i + offsetOut*output_contiguous_size] =
+                            inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                            * inputs2[in1_id + offsetIn1*input1_contiguous_size];
+                    }
+                    if (++stack < nbMatrices) {
+                        int tmp_stack = stack;
+                        while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                            tmp_stack /= OUTPUT_DIMS[dim];
+                            dim--;
+                        }
+                        offsetIn0 += stride_step0[dim];
+                        offsetIn1 += stride_step1[dim];
+                        ++offsetOut;
+                        dim = contiguousIdx - 1;
+                    }
+                }
+                break;
+            }
+            case Div: {
+                int ndim_a[SIZE_DIM_OUT];
+                int ndim_b[SIZE_DIM_OUT];
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+                    ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+                }
+                for (int i = 0; i < SIZE_DIM_OUT; i++) {
+                    int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+                    ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+                }
+                // Find the first dimension from which all trailing dims of A and B match
+                int contiguousIdx = SIZE_DIM_OUT - 1;
+                for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+                    contiguousIdx = i;
+                }
+                // Compute the highest number of contiguous data for each Tensor
+                int input0_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input0_contiguous_size *= ndim_a[i];
+                }
+                int input1_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    input1_contiguous_size *= ndim_b[i];
+                }
+                int output_contiguous_size = 1;
+                for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
+                    output_contiguous_size *= OUTPUT_DIMS[i];
+                }
+                // initialize strides to iterate through data because of broadcasting
+                int stride_post0[SIZE_DIM_OUT];
+                int stride_post1[SIZE_DIM_OUT];
+                int stride_step0[SIZE_DIM_OUT];
+                int stride_step1[SIZE_DIM_OUT];
+                if (contiguousIdx > 0) {
+                    stride_post0[contiguousIdx - 1] = 1;
+                    stride_post1[contiguousIdx - 1] = 1;
+                    for (int i = contiguousIdx - 2; i >= 0; --i) {
+                        stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+                        stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+                    }
+                    for (int i = 0; i < contiguousIdx; ++i) {
+                        stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+                        stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+                    }
+                }
+                int offsetIn0 = 0;
+                int offsetIn1 = 0;
+                int offsetOut = 0;
+                int nbMatrices = 1;
+                for(int i = 0; i<contiguousIdx; ++i){
+                    nbMatrices *= OUTPUT_DIMS[i];
+                }
+                int dim = contiguousIdx - 1;
+                for(int stack = 0; stack < nbMatrices;){
+                    for(int i = 0; i < output_contiguous_size; ++i){
+                        int in0_id = (input0_contiguous_size != 1) ? i : 0;
+                        int in1_id = (input1_contiguous_size != 1) ? i : 0;
+                        outputs[i + offsetOut*output_contiguous_size] =
+                            inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                            / inputs2[in1_id + offsetIn1*input1_contiguous_size];
+                    }
+                    if (++stack < nbMatrices) {
+                        int tmp_stack = stack;
+                        while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                            tmp_stack /= OUTPUT_DIMS[dim];
+                            dim--;
+                        }
+                        offsetIn0 += stride_step0[dim];
+                        offsetIn1 += stride_step1[dim];
+                        ++offsetOut;
+                        dim = contiguousIdx - 1;
+                    }
                 }
                 break;
             }
             default: {
-                // Copy inputs1 in outputs for default case
                 for (int i = 0; i < NB_ELTS; ++i) {
                     val = inputs1[i];
                     outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
@@ -58,7 +338,6 @@ void elemwise_forward (
     else
     {
         int32_t val = 0;
-
         switch (ELEM_OP) {
             case Add: {
                 for (int i = 0; i < NB_ELTS; ++i) {
@@ -81,6 +360,13 @@ void elemwise_forward (
                 }
                 break;
             }
+            case Div: {
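+                // NOTE: integer division by zero is undefined behaviour;
+                // inputs2 is assumed to contain no zeros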
+                for (int i = 0; i < NB_ELTS; ++i) {
+                    val = inputs1[i] / inputs2[i];
+                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
+                }
+                break; 
+            }
             default: {
                 // Copy inputs1 in outputs for default case
                 for (int i = 0; i < NB_ELTS; ++i) {
@@ -94,78 +380,6 @@ void elemwise_forward (
 }
 
 
-// Generic function for multiple inputs
-// Not working
-
-// template<ElemWise_T ELEM_OP, typename Output_T>
-// __attribute__((always_inline)) inline
-// Output_T elemWise (int /*pos*/, int /*ch*/)
-// {
-//     return 0;
-// }
-
-// template<ElemWise_T ELEM_OP,
-//          int NB_CHANNELS,
-//          // For next inputs
-//          int... ARGS,
-//          typename... INPUTS,
-//          // Types
-//          typename Input_T, typename Output_T>
-// __attribute__((always_inline)) inline
-// Output_T elemWise (int pos, int ch,
-//                    const Input_T* __restrict firstInputs,
-//                    INPUTS... inputs)
-// {
-//     int iOffset = NB_CHANNELS * pos;
-
-//     return firstInputs[iOffset + ch]
-//                 + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
-// }
-
-// template<// For all inputs
-//          int NB_CHANNELS,
-//          int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-//          int NB_ELTS,
-//          int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-//          ElemWise_T ELEM_OP,
-//          ActivationFunction_T ACTIVATION,
-//          // For next inputs
-//          int... ARGS,
-//          typename... INPUTS,
-//          // Types
-//          typename Input_T, typename Output_T,
-//          typename Rescaling_T>
-// __attribute__((always_inline)) inline
-// void elemWise_forward (
-//     Output_T* __restrict outputs,
-//     const Rescaling_T& __restrict rescaling,
-//     const Input_T* __restrict firstInputs,
-//     INPUTS... inputs)
-// {
-//     for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
-//         for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
-//             const int pos = (ox + OUTPUTS_WIDTH * oy);
-//             int oOffset = NB_ELTS * pos;
-
-//             for (int ch = 0; ch < NB_ELTS; ++ch) {
-//                 const Add_T val = elemWise<ELEM_OP,
-//                                         INPUT_NB_CHANNELS,
-//                                         INPUT_MEM_CONT_OFFSET,
-//                                         INPUT_MEM_CONT_NB_ELTS,
-//                                         INPUT_MEM_WRAP_OFFSET,
-//                                         INPUT_MEM_WRAP_NB_ELTS,
-//                                         INPUT_MEM_STRIDE,
-//                                         ARGS...>(pos, ch, firstInputs, inputs...);
-
-//                 outputs[oOffset + ch]
-//                     = sat<Output_T>(val, ch, ACTIVATION, rescaling);
-//             }
-//         }
-//     }
-// }
-
-
-
 
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..88aafe2407c4a1106b1e05acf2406a1337fa5ac6
--- /dev/null
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -0,0 +1,40 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
+#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
+
+#include "network/typedefs.hpp"
+#include <cmath>
+
+template<int NB_ELTS,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void erf_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    double a1 =  0.254829592;
+    double a2 = -0.284496736;
+    double a3 =  1.421413741;
+    double a4 = -1.453152027;
+    double a5 =  1.061405429;
+    double p  =  0.3275911;
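+
+    // Polynomial approximation of erf from Abramowitz & Stegun, formula
+    // 7.1.26; maximum absolute error is about 1.5e-7, which is sufficient
+    // for single-precision outputs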
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif 
+    for (int i = 0; i < NB_ELTS; ++i) {
+        const int sign = (inputs[i] < 0) ? -1 : 1;
+        // std::fabs avoids accidentally selecting the integer abs() overload
+        const double abs_value = std::fabs(static_cast<double>(inputs[i]));
+        
+        // A&S formula 7.1.26
+        double t = 1.0/(1.0 + p*abs_value);
+        double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*std::exp(-abs_value*abs_value);
+        outputs[i] = sign*y;
+
+    }
+}
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ERF__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index 895ed1c21d35e7e266f788407dd7f42719607ad7..97547f806e97a8b187bbf260ef6ffaeac33cfaf6 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -28,7 +28,7 @@ void fullyconnected_forward (
     // It is only an issue if the FC was after a flatten layer.
     // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
     // Solution: Add a system to check dataformat
-    for (int och = 0; och < NB_OUTPUTS; och++) {
+    /*for (int och = 0; och < NB_OUTPUTS; och++) {
 
         Bias_T weightedSum = biases[och];
 
@@ -42,9 +42,9 @@ void fullyconnected_forward (
         }
 
         outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
-    }
-/*
-Here the kernel to use with inputs in NHWC and weights in NHWC
+    }*/
+
+// Here the kernel to use when both inputs and weights are in NHWC
 #pragma omp parallel for
     for (int och = 0; och < NB_OUTPUTS; och++) {
 
@@ -65,7 +65,7 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
 
         outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
     }
-*/
+
 }
 
 
diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp
index 4500993e02cf42fb698bc9004462800bdd3f7dc4..1403a010298e32a6a5670edb5e1efeb03ef5d37d 100644
--- a/aidge_export_cpp/kernels/matmul.hpp
+++ b/aidge_export_cpp/kernels/matmul.hpp
@@ -6,28 +6,106 @@
 
 // Generic function for matmul and activation
 
-template<int M,
-         int K,
-         int N,
+template<int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT,
          ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+         typename Input_T, typename Output_T>
 __attribute__((always_inline)) inline
 void matmul_forward (
     const Input_T* __restrict inputs1,
     const Input_T* __restrict inputs2,
-    Output_T* __restrict outputs,
-    const Rescaling_T& __restrict rescaling)
+    Output_T* __restrict outputs)
 {
-    for (int m = 0; m < M; ++m) {
-        for (int n = 0; n < N; ++n) {
-            Output_T sum = Output_T(0);
-            for (int k = 0; k < K; ++k) {
-                sum += inputs1[K*m + k] * inputs2[N*k + n];
+
+    // Initialize arrays storing the broadcast (or original) dims.
+    // Assumes SIZE_DIM_OUT >= 2: a 1-D A is treated as a row vector [1, K]
+    // and a 1-D B as a column vector [K, 1]; higher-rank inputs are
+    // right-aligned and left-padded with 1s
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    if (SIZE_DIM_IN_A == 1) {
+        for (int i = 0; i < SIZE_DIM_OUT - 1; i++) { ndim_a[i] = 1; }
+        ndim_a[SIZE_DIM_OUT - 1] = INPUT_A_DIMS[0];
+    } else {
+        for (int i = 0; i < SIZE_DIM_OUT; i++) {
+            int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+            ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+        }
+    }
+    if (SIZE_DIM_IN_B == 1) {
+        for (int i = 0; i < SIZE_DIM_OUT; i++) { ndim_b[i] = 1; }
+        ndim_b[SIZE_DIM_OUT - 2] = INPUT_B_DIMS[0];
+    } else {
+        for (int i = 0; i < SIZE_DIM_OUT; i++) {
+            int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+            ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+        }
+    }
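+    // Example (illustrative): A dims [2, 3, 4] and B dims [4, 5] give
+    // ndim_a = [2, 3, 4], ndim_b = [1, 4, 5]; the output is [2, 3, 5],
+    // computed as nbMatrices = 2 independent [3, 4] x [4, 5] products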
+        
+    // initialize strides to iterate through data because of broadcasting
+    // Sized with SIZE_DIM_OUT so the declarations stay valid C++ when
+    // SIZE_DIM_OUT == 2 (zero-length arrays are a compiler extension)
+    int stride_post0[SIZE_DIM_OUT];
+    int stride_post1[SIZE_DIM_OUT];
+    int stride_step0[SIZE_DIM_OUT];
+    int stride_step1[SIZE_DIM_OUT];
+    if (SIZE_DIM_OUT > 2) {
+        stride_post0[SIZE_DIM_OUT - 3] = 1;
+        stride_post1[SIZE_DIM_OUT - 3] = 1;
+        for (int i = SIZE_DIM_OUT - 4; i >= 0; --i) {
+            stride_post0[i] = stride_post0[i+1] * ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1] * ndim_b[i+1];
+        }
+        for (int i = 0; i < SIZE_DIM_OUT - 2; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+
+    // If SIZE_DIM_IN_A == SIZE_DIM_IN_B, both already have rank SIZE_DIM_OUT;
+    // otherwise the smaller input is broadcast over the leading dimensions
+    int nbMatrices = 1;
+    for (int i = SIZE_DIM_OUT - 3; i >= 0; --i) {
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = SIZE_DIM_OUT - 3;
+
+
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
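+    // Inner 2-D blocks: A is [n, k], B is [k, m], so each product is [n, m].
+    // NOTE: ACTIVATION is declared but not applied here (assumed Linear).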
+    const int n = ndim_a[SIZE_DIM_OUT - 2];
+    const int k = ndim_a[SIZE_DIM_OUT - 1];
+    const int m = ndim_b[SIZE_DIM_OUT - 1];
+    const int matrix0Size = n*k;
+    const int matrix1Size = k*m;
+    const int matrixOutSize = n*m;
+
+    for(int stack = 0; stack < nbMatrices;){
+
+        for (int i = 0; i < n; ++i) {
+
+            for (int j = 0; j < m; ++j) {
+                Output_T sum = Output_T(0);
+
+                for (int l = 0; l < k; ++l) {
+                    sum += inputs1[offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j];
+                }
+                outputs[offsetOut*matrixOutSize + i*m + j] = sum;
+            }
+        } 
+
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
             }
-            outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = SIZE_DIM_OUT -3;
         }
+
     }
+
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/mul.hpp b/aidge_export_cpp/kernels/mul.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c1ba622ef5971dbac8979d89931dc1f4e13cce5
--- /dev/null
+++ b/aidge_export_cpp/kernels/mul.hpp
@@ -0,0 +1,98 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_MUL__
+#define __AIDGE_EXPORT_CPP_KERNELS_MUL__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+
+template<int NB_ELTS,
+         int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void mul_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+        ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+    }
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+        ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+    }
+    
+    // Find the first dimension from which all trailing dims of A and B match
+    int contiguousidx = SIZE_DIM_OUT - 1;
+
+    for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[SIZE_DIM_OUT];
+    int stride_post1[SIZE_DIM_OUT];
+    int stride_step0[SIZE_DIM_OUT];
+    int stride_step1[SIZE_DIM_OUT];
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx - 1] = 1;
+        stride_post1[contiguousidx - 1] = 1;
+        for (int i = contiguousidx - 2; i >= 0; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for (int i = 0; i < contiguousidx; ++i) {
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = contiguousidx - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] =
+                inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                * inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MUL__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index 478b6a58aed45e2bce0ed1683ad113f9c7a8bffb..8f6de40363f5b932148b0ac6a860611cd5b0ea9c 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -7,6 +7,35 @@
 #include <stdexcept>
 
 
+// Converts between NCHW and NHWC layouts (direct == true: NCHW -> NHWC,
+// direct == false: NHWC -> NCHW). Templated so the header can be included
+// in several translation units and used with any element type.
+template<typename T>
+void reorder_NCHW_NHWC_pool(const T* input, T* output, int N, int C, int H, int W, bool direct = true) {
+    auto nchw_index = [=](int n, int c, int h, int w) {
+        return ((n * C + c) * H + h) * W + w;
+    };
+
+    auto nhwc_index = [=](int n, int h, int w, int c) {
+        return ((n * H + h) * W + w) * C + c;
+    };
+#ifdef _OPENMP
+    // A single collapsed parallel loop; nested parallel regions would only
+    // add scheduling overhead
+    #pragma omp parallel for collapse(2)
+#endif
+    for (int n = 0; n < N; ++n) {
+        for (int c = 0; c < C; ++c) {
+            for (int h = 0; h < H; ++h) {
+                for (int w = 0; w < W; ++w) {
+                    if (direct) {
+                        output[nhwc_index(n, h, w, c)] = input[nchw_index(n, c, h, w)];
+                    } else {
+                        output[nchw_index(n, c, h, w)] = input[nhwc_index(n, h, w, c)];
+                    }
+                }
+            }
+        }
+    }
+}
+
+
 template<int NB_CHANNELS, 
          int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
          int NB_OUTPUTS,
@@ -22,11 +51,16 @@ void pooling_forward(
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs)
 {
+
+    Input_T inputs_ordered[NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH];
+    Output_T outputs_unordered[NB_OUTPUTS * OUTPUTS_HEIGHT * OUTPUTS_WIDTH];
+    reorder_NCHW_NHWC_pool(inputs, inputs_ordered, 1, NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH, true);
+
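+    // The loops below index data in NHWC while this export provides NCHW
+    // buffers, hence the local reordering above and the inverse reorder at
+    // the end of the function
+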
     constexpr int OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y;
     constexpr int OUTPUTS_WIDTH_NOPAD
         = (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X;
-
+#ifdef _OPENMP
+    #pragma omp parallel for
+#endif
     for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
         const int syMin = (PADDING_Y == 0) ? 0
             : max(PADDING_Y - (oy * STRIDE_Y), 0);
@@ -78,15 +112,15 @@ void pooling_forward(
 
                             int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
 
-                            if (inputs[iOffsetInRange] > maxVal)
-                                maxVal = inputs[iOffsetInRange];
+                            if (inputs_ordered[iOffsetInRange] > maxVal)
+                                maxVal = inputs_ordered[iOffsetInRange];
                         }
                     }
 
-                    outputs[oOffset + output] = maxVal;
+                    outputs_unordered[oOffset + output] = maxVal;
                 }
                 else if (POOLING_TYPE == Average) {
-                    int32_t sum = 0;
+                    Input_T sum = 0;
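+                    // NOTE: accumulating in Input_T is fine for floating-point
+                    // inputs but can overflow for narrow integer types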
 
                     for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
                         if ((PADDING_Y != 0
@@ -109,11 +143,12 @@ void pooling_forward(
                             }
 
                             int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
-                            sum += inputs[iOffsetInRange];
+                            sum += inputs_ordered[iOffsetInRange];
                         }
                     }
 
-                    outputs[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH));
+                    outputs_unordered[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH));
                 }
                 else {
                     throw std::runtime_error("The export only supports Max and Average pooling.");
@@ -121,6 +156,7 @@ void pooling_forward(
             }
         }
     }
+    reorder_NCHW_NHWC_pool(outputs_unordered, outputs, 1, NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH, false);
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_POOLING__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_POOLING__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/sub.hpp b/aidge_export_cpp/kernels/sub.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2576edc1e3ba12ed1147e2a83a4a94cd875c574d
--- /dev/null
+++ b/aidge_export_cpp/kernels/sub.hpp
@@ -0,0 +1,100 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_SUB__
+#define __AIDGE_EXPORT_CPP_KERNELS_SUB__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+
+
+template<int NB_ELTS,
+         int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
+         int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void sub_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_A;
+        ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i - idx];
+    }
+    for (int i = 0; i < SIZE_DIM_OUT; i++) {
+        int idx = SIZE_DIM_OUT - SIZE_DIM_IN_B;
+        ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i - idx];
+    }
+    
+    // Find the first dimension from which all trailing dims of A and B match
+    int contiguousidx = SIZE_DIM_OUT - 1;
+
+    for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for (int i = contiguousidx; i < SIZE_DIM_OUT; ++i) {
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[SIZE_DIM_OUT];
+    int stride_post1[SIZE_DIM_OUT];
+    int stride_step0[SIZE_DIM_OUT];
+    int stride_step1[SIZE_DIM_OUT];
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx - 1] = 1;
+        stride_post1[contiguousidx - 1] = 1;
+        for (int i = contiguousidx - 2; i >= 0; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for (int i = 0; i < contiguousidx; ++i) {
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = contiguousidx - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] =
+                inputs1[in0_id + offsetIn0*input0_contiguous_size]
+                - inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_SUB__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/transpose_diff.hpp b/aidge_export_cpp/kernels/transpose_diff.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..712d9b42a9282578bfa80ef840a2f54b86081f55
--- /dev/null
+++ b/aidge_export_cpp/kernels/transpose_diff.hpp
@@ -0,0 +1,54 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+
+#include "network/typedefs.hpp"
+
+
+template<int INPUT_DIMS[], int PERM[], int OUTPUT_DIMS[],
+         int SIZE_OUTPUT_DIMS, int SIZE,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void transpose_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+
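+    // Walks the input linearly while maintaining its multi-dimensional index
+    // and scatters each element to its permuted position in the output.
+    // Example (illustrative): PERM = {0, 2, 3, 1} maps NCHW to NHWC.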
+    int newStrides[SIZE_OUTPUT_DIMS];
+    for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) { newStrides[i] = 1; }
+    for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) {
+        for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) {
+            newStrides[i] *= OUTPUT_DIMS[j];
+        }
+    }
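+    // newStrides[i] is the row-major stride of output dimension i, i.e. the
+    // number of elements covered by output dimensions i+1 and beyond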
+
+    int indices[SIZE_OUTPUT_DIMS];
+    for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) { indices[i] = 0; }
+
+    for (int i = 0; i < SIZE; ++i) {
+        int idx = 0;
+        for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
+            idx += indices[PERM[j]] * newStrides[j];
+        }
+
+        outputs[idx] = inputs[i];
+
+        // Advance the multi-dimensional input index (odometer-style)
+        for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
+            if (indices[j] < INPUT_DIMS[j] - 1) {
+                indices[j]++;
+                break;
+            }
+            else {
+                indices[j] = 0;
+            }
+        }
+    }
+}
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index 346928f4a84c403df2172311cede8b99fd06eebe..18089fd97af7f39bf00a62251f1fd5c47391b362 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -56,7 +56,7 @@ class ProducerCPP(ExportNode):
         super().__init__(node, mem_info)
         self.values = np.array(self.operator.get_output(0))
 
         if len(self.values.shape) == 4:  # Note: export in HWC
             self.values =  np.transpose(self.values, (0, 2, 3, 1))
 
     def export(self, export_folder: Path):
@@ -143,6 +143,24 @@ def _setup_conv2D(conv):
         str(ROOT / "kernels" / "rescaling.hpp")
     ]
 
+
+def _setup_elemwise_op(elemwise, op):
+    """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
+
+    elemwise.attributes["elemwise_op"] = op
+    elemwise.attributes["activation"] = "Linear"
+    elemwise.attributes["rescaling"] = "NoScaling"
+    elemwise.config_template = str(
+        ROOT / "templates" / "configuration" / "elemwise_config.jinja")
+    elemwise.forward_template = str(
+        ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
+    elemwise.include_list = []
+    elemwise.kernels_to_copy = [
+        str(ROOT / "kernels" / "elemwise.hpp"),
+        str(ROOT / "kernels" / "activation.hpp"),
+        str(ROOT / "kernels" / "rescaling.hpp")
+    ]
+
 @ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class ConvCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -150,7 +168,7 @@ class ConvCPP(ExportNodeCpp):
         # No padding with Conv
         # Use PaddedConv to add padding attribute
         self.attributes["padding"] = [0, 0]
-
+        self.attributes["groups"] = 1
         _setup_conv2D(self)
 
 @ExportLibCpp.register_metaop("PaddedConv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
@@ -169,25 +187,28 @@ class PaddedConvCPP(ExportNodeCpp):
                 ).attr.stride_dims
                 self.attributes["dilation_dims"] = n.get_operator(
                 ).attr.dilation_dims
-
+        self.attributes["groups"] = 1
         _setup_conv2D(self)
 
-def _setup_elemwise_op(elemwise, op):
-    """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
+@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class PaddedConvDepthWiseCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        # TODO: find a way to retrieve attributes for meta-operators
+        for n in self.operator.get_micro_graph().get_nodes():
+            if n.type() == "Pad2D":
+                self.attributes["padding"] = n.get_operator(
+                ).attr.begin_end_borders
+            if n.type() == "ConvDepthWise2D":
+                self.attributes["kernel_dims"] = n.get_operator(
+                ).attr.kernel_dims
+                self.attributes["stride_dims"] = n.get_operator(
+                ).attr.stride_dims
+                self.attributes["dilation_dims"] = n.get_operator(
+                ).attr.dilation_dims
 
-    elemwise.attributes["elemwise_op"] = op
-    elemwise.attributes["activation"] = "Linear"
-    elemwise.attributes["rescaling"] = "NoScaling"
-    elemwise.config_template = str(
-        ROOT / "templates" / "configuration" / "elemwise_config.jinja")
-    elemwise.forward_template = str(
-        ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
-    elemwise.include_list = []
-    elemwise.kernels_to_copy = [
-        str(ROOT / "kernels" / "elemwise.hpp"),
-        str(ROOT / "kernels" / "activation.hpp"),
-        str(ROOT / "kernels" / "rescaling.hpp")
-    ]
+        self.attributes["groups"] = self.attributes["out_chan"][0]
+        _setup_conv2D(self)
 
 @ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class AddCPP(ExportNodeCpp):
@@ -210,6 +231,14 @@ class MulCPP(ExportNodeCpp):
 
         _setup_elemwise_op(self, "Mul")
 
+@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class DivCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        _setup_elemwise_op(self, "Div")
+
+
 def _setup_pooling(pooling):
     """Common code (template and kernel setup) shared across all the different pooling operator."""
 
@@ -302,4 +331,39 @@ class TransposeCPP(ExportNodeCpp):
         self.include_list = []
         self.kernels_to_copy = [
             str(ROOT / "kernels" / "transpose.hpp")
+        ]
+
+
+@ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class ErfCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "erf_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "erf_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "erf.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+
+@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class BatchNorm2DCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "batchnorm_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "batchnorm_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "batchnorm.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
         ]
\ No newline at end of file
diff --git a/aidge_export_cpp/static/include/network/typedefs.hpp b/aidge_export_cpp/static/include/network/typedefs.hpp
index acece91115f73a57197c8a423cd34ec37b2f2e2a..9b8360287969a8e1583a94472e1f55db2d8d5673 100644
--- a/aidge_export_cpp/static/include/network/typedefs.hpp
+++ b/aidge_export_cpp/static/include/network/typedefs.hpp
@@ -19,7 +19,8 @@ typedef enum {
 typedef enum {
     Add,
     Sub,
-    Mul
+    Mul,
+    Div
 } ElemWise_T;
 
 typedef enum {
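For context, the generic element-wise kernel selects its arithmetic operator from this enum at compile time; the new Div entry presumably extends the existing dispatch along these lines (a sketch, not the actual kernels/elemwise.hpp source):

    // Hypothetical per-element dispatch on ElemWise_T (illustrative only).
    template<ElemWise_T ELEM_OP, typename T>
    inline T elemwise_apply(T a, T b) {
        switch (ELEM_OP) {
        case Add: return a + b;
        case Sub: return a - b;
        case Mul: return a * b;
        case Div: return a / b;  // case enabled by this patch
        }
        return a;  // unreachable for valid ELEM_OP values
    }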
diff --git a/aidge_export_cpp/templates/configuration/add_config.jinja b/aidge_export_cpp/templates/configuration/add_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/add_config.jinja
@@ -0,0 +1,24 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja
index a4a2462d1f475424d799f38e52075fafb333c0d0..417f240c64b3dfe1dc204ba6344b693aa4409998 100644
--- a/aidge_export_cpp/templates/configuration/convolution_config.jinja
+++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja
@@ -5,6 +5,8 @@
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
+
+#define {{ name|upper }}_GROUPS {{ groups }}
 #define {{ name|upper }}_PADDING_Y {{ padding[1] }}
 #define {{ name|upper }}_PADDING_X {{ padding[0] }}
 #define {{ name|upper }}_STRIDE_Y {{ stride_dims[1] }}
@@ -21,5 +23,4 @@ static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
 #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
 #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
 
-
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/div_config.jinja b/aidge_export_cpp/templates/configuration/div_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/div_config.jinja
@@ -0,0 +1,24 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/elemwise_config.jinja b/aidge_export_cpp/templates/configuration/elemwise_config.jinja
index 91a0be4cc4b6fc15e8b979ecd3ca01f122ebc63d..c00ec86a33d44ea2a4c1632736378d52f85657d8 100644
--- a/aidge_export_cpp/templates/configuration/elemwise_config.jinja
+++ b/aidge_export_cpp/templates/configuration/elemwise_config.jinja
@@ -7,6 +7,18 @@
 {% include "./_meminfo.jinja" %}
 {# For layer configuration -#}
 #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+
 #define {{ name|upper }}_ACTIVATION {{ activation }}
 #define {{ name|upper }}_ELEM_OP {{ elemwise_op }}
 static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
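To make the generated configuration concrete: for a hypothetical Add layer named add0 with input shapes [1, 3, 8] and [1, 1, 8], the template above renders roughly as follows (the _def_io.jinja and _meminfo.jinja output is omitted):

    #define ADD0_NB_ELTS 1*3*8
    #define ADD0_NB_ELTS_B 1*1*8

    int ADD0_OUTPUT_DIMS[] =  { 1, 3, 8 };
    int ADD0_INPUT_A_DIMS[] = { 1, 3, 8 };
    int ADD0_INPUT_B_DIMS[] = { 1, 1, 8 };

    #define ADD0_SIZE_DIM_IN_A 3
    #define ADD0_SIZE_DIM_IN_B 3
    #define ADD0_SIZE_DIM_OUT 3

    #define ADD0_OUT_SIZE 24
    #define ADD0_ACTIVATION Linear
    #define ADD0_ELEM_OP Add
    static const NoScaling ADD0_RESCALING = {};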
diff --git a/aidge_export_cpp/templates/configuration/erf_config.jinja b/aidge_export_cpp/templates/configuration/erf_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..b273472091c2789837f473ef71f03de6d1702473
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/erf_config.jinja
@@ -0,0 +1,10 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja
index 38316f20947fa726085bf3577ead510e6c5096f3..6ef27fef712ed0417b33da062a6dfbea1b49d8f7 100644
--- a/aidge_export_cpp/templates/configuration/matmul_config.jinja
+++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja
@@ -1,16 +1,27 @@
 {#- For name header -#}
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
 
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 
 {# For layer configuration -#}
-#define {{ name|upper }}_M {{ in_dims[0][0] }}
-#define {{ name|upper }}_K {{ in_dims[0][1] }}
-#define {{ name|upper }}_N {{ in_dims[1][1] }}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_B {{ in_dims[0][0]}}
+#define {{ name|upper }}_C {{ in_chan[0]}}
+#define {{ name|upper }}_M {{ in_height[0]}}
+#define {{ name|upper }}_K {{ in_width[0] }}
+#define {{ name|upper }}_N {{ out_width[0] }}
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
 #define {{ name|upper }}_ACTIVATION {{ activation }}
-static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
 
 {#- Calculate sizes #}
 
diff --git a/aidge_export_cpp/templates/configuration/mul_config.jinja b/aidge_export_cpp/templates/configuration/mul_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/mul_config.jinja
@@ -0,0 +1,24 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/sub_config.jinja b/aidge_export_cpp/templates/configuration/sub_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..143d0048cd771dd1e47ef0758395538d83663a20
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/sub_config.jinja
@@ -0,0 +1,24 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/transpose_config.jinja b/aidge_export_cpp/templates/configuration/transpose_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..01e57b89ac2cbd64635db236aa85b342778c9bab
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/transpose_config.jinja
@@ -0,0 +1,15 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_OUTPUT_DIMS {{out_dims[0]|length}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_PERM[] = { {{ output_dims_order|join(", ") }} };
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/add_forward.jinja b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..3176cedfbc9c172612cae7b50dcb87c4efedfd09
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+add_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
index 421013b9590dabe6ee0ac12f969494913414a530..98b9e0394c737afa4ed12a4424b0004b365d3049 100644
--- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
@@ -4,6 +4,7 @@ convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                     {{ in_name[0]|upper }}_IN_HEIGHT,
                     {{ in_name[0]|upper }}_IN_WIDTH,
                     {{ out_name[0]|upper }}_NB_OUTPUTS,
+                    {{name|upper}}_GROUPS,
                     {{ out_name[0]|upper }}_OUT_HEIGHT,
                     {{ out_name[0]|upper }}_OUT_WIDTH,
                     {{name|upper}}_PADDING_Y,
diff --git a/aidge_export_cpp/templates/kernel_forward/div_forward.jinja b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..4b793570a01405ac7fbdac1a7e4b84c283024c8e
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+div_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
index f60d163dcbfd6eff75e6b66c37bc5e57cf2cfca9..3ac42a524432aba9e2082e24e8a2755c99ff84cc 100644
--- a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
@@ -1,8 +1,15 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
 elemwise_forward<{{name|upper}}_NB_ELTS,
-                 {{name|upper}}_ELEM_OP,
-                 {{name|upper}}_ACTIVATION>
+               {{name|upper}}_ELEM_OP,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
                  ({{out_name[0]}}, {{name|upper}}_RESCALING, {{in_name[0]}}, {{in_name[1]}});
 {% include "./_save_outputs.jinja" %}
 {% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..9f3fbf3009c96454aceb306042358355e6b60a22
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
@@ -0,0 +1,6 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+erf_forward<{{name|upper}}_NB_ELTS>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
index 64b3df301794e1cb3d56170646a6b9524f18a6ab..1a634fe53b1c77f45d87193410d738a1c0c03760 100644
--- a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
@@ -1,9 +1,12 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-matmul_forward<{{name|upper}}_M,
-               {{name|upper}}_K,
-               {{name|upper}}_N,
-               {{name|upper}}_ACTIVATION>
-               ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}, {{name|upper}}_RESCALING);
+matmul_forward<{{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_ACTIVATION>
+               ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}});
 {% include "./_save_outputs.jinja" %}
 {% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..9a7170b510b56ed75872aab5d7b08b35921352b0
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+mul_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..51b47a8a83d62eb87ac4e5c96026d8e5458d7e65
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+sub_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..2a5433c20636a22c2a2fde1f15cbc7c39b2f8123
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
@@ -0,0 +1,10 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+transpose_forward<{{ name|upper }}_INPUT_DIMS,
+                    {{ name|upper }}_PERM, 
+                    {{ name|upper}}_OUTPUT_DIMS, 
+                    {{ name|upper}}_SIZE_OUTPUT_DIMS, 
+                    {{name|upper}}_SIZE>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
\ No newline at end of file
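Rendered for a hypothetical node named transpose0 (buffer names are illustrative, and the _mem_offset.jinja / _save_outputs.jinja output is omitted), this template expands to:

    transpose_forward<TRANSPOSE0_INPUT_DIMS,
                      TRANSPOSE0_PERM,
                      TRANSPOSE0_OUTPUT_DIMS,
                      TRANSPOSE0_SIZE_OUTPUT_DIMS,
                      TRANSPOSE0_SIZE>
                     (transpose0_input, transpose0_output);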