diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..57cbc6fd386b5b5c7b71c7ca6942230444eb3d55
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.hpp
@@ -0,0 +1,21 @@
+/**
+ * @brief Element-wise sum of two buffers: output[i] = input_a[i] + input_b[i].
+ *
+ * @tparam SIZE    Number of elements processed in each buffer.
+ * @tparam T       Element type shared by both operands and the result.
+ * @param input_a  Left operand, read at indices [0, SIZE).
+ * @param input_b  Right operand, read at indices [0, SIZE).
+ * @param output   Destination, written at indices [0, SIZE). All three
+ *                 pointers are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_add(T* __restrict input_a, T* __restrict input_b, T* __restrict output) {
+  unsigned int idx = 0;
+  while (idx < SIZE) {
+    // Deliberately no explicit cast: an implicit narrowing back to T then
+    // stays visible as a compiler warning instead of being silenced.
+    output[idx] = input_a[idx] + input_b[idx];
+    ++idx;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp
index d4da329a44eb8669bc0469a31beeca670b63e6c3..cbbe4e3ccb5ec4ee8483cdf8cdef2042ad6b0446 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp
@@ -1,8 +1,8 @@
 #include <cmath>
 
-template <unsigned int SIZE, typename Input_T, typename Output_T>
+template <unsigned int SIZE, typename T>
 __attribute__((always_inline)) inline static
-void aidge_atan(Input_T* __restrict input, Output_T* __restrict output) {
+void aidge_atan(T* __restrict input, T* __restrict output) {
   for (unsigned int i = 0; i < SIZE; ++i) {
     // Note : no cast to get compiler warning if we lose precision during auto cast!
     output[i] = std::atan(input[i]);
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9accb6072bd2c09ee97cfe23aa3d906764ef9cf2
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm.hpp
@@ -0,0 +1,63 @@
+#include <cmath>
+
+/**
+ * @brief Per-channel batch normalization of a channel-major buffer.
+ *
+ * For every channel c and every element x of that channel's feature map:
+ *   y = bias[c] + scale[c] * (x - input_mean[c]) / sqrt(input_var[c] + epsilon)
+ * Channel c occupies the contiguous index range
+ * [c * featureMapSize, (c + 1) * featureMapSize) of inputs and outputs, where
+ * featureMapSize is the product of the NB_SpatialDims entries of spatial_dims.
+ *
+ * @tparam T               Element type of inputs and outputs.
+ * @tparam MeanVar_T       Element type of input_mean and input_var.
+ * @tparam ScaleBias_T     Element type of scale and bias.
+ * @tparam SpatialDims_T   Element type of spatial_dims.
+ * @tparam NB_Channels     Number of channels.
+ * @tparam NB_SpatialDims  Number of entries read from spatial_dims.
+ * @param inputs        Input buffer, NB_Channels * featureMapSize elements.
+ * @param outputs       Output buffer of the same size; must not overlap inputs.
+ * @param input_mean    Per-channel mean, NB_Channels entries.
+ * @param input_var     Per-channel variance, NB_Channels entries.
+ * @param scale         Per-channel multiplicative factor, NB_Channels entries.
+ * @param bias          Per-channel additive offset, NB_Channels entries.
+ * @param spatial_dims  Extent of each spatial dimension.
+ * @param epsilon       Added to the variance before taking the square root.
+ */
+template <
+            typename T,
+            typename MeanVar_T,
+            typename ScaleBias_T,
+            typename SpatialDims_T,
+            unsigned int NB_Channels,
+            unsigned int NB_SpatialDims
+        >
+__attribute__((always_inline)) inline static
+void aidge_batchnorm(T* __restrict inputs,
+                    T* __restrict outputs,
+                    MeanVar_T* __restrict input_mean,
+                    MeanVar_T* __restrict input_var,
+                    ScaleBias_T* __restrict scale,
+                    ScaleBias_T* __restrict bias,
+                    SpatialDims_T* __restrict spatial_dims,
+                    float epsilon)
+{
+    int featureMapSize = 1;
+    for (unsigned int d = 0; d < NB_SpatialDims; ++d) {
+        featureMapSize *= spatial_dims[d];
+    }
+
+    for (unsigned int channel = 0; channel < NB_Channels; ++channel) {
+        const int ioIndex = static_cast<int>(channel) * featureMapSize;
+        // std::sqrt selects the overload matching its argument; an unqualified
+        // sqrt after <cmath> may resolve to the double-only C function.
+        const float stddev = std::sqrt(input_var[channel] + epsilon);
+
+        for (int feature = 0; feature < featureMapSize; ++feature) {
+            // Start from the bias converted to T, then accumulate the scaled term.
+            T normalized = bias[channel];
+            normalized += scale[channel] * (inputs[ioIndex + feature] - input_mean[channel]) / stddev;
+            outputs[ioIndex + feature] = normalized;
+        }
+    }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.c b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.hpp
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.c
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.hpp
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fa4772d5af824b491cc6307e1dcf6ad58af5083
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.hpp
@@ -0,0 +1,24 @@
+/**
+ * @brief Element-wise quotient of two buffers: output[i] = input_a[i] / input_b[i].
+ *
+ * No check is made for a zero divisor (see the TODO in the loop).
+ *
+ * @tparam SIZE    Number of elements processed in each buffer.
+ * @tparam T       Element type shared by both operands and the result.
+ * @param input_a  Dividend, read at indices [0, SIZE).
+ * @param input_b  Divisor, read at indices [0, SIZE).
+ * @param output   Destination, written at indices [0, SIZE). All three
+ *                 pointers are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_div(T* __restrict input_a, T* __restrict input_b, T* __restrict output) {
+  unsigned int idx = 0;
+  while (idx < SIZE) {
+    // Deliberately no explicit cast: an implicit narrowing back to T then
+    // stays visible as a compiler warning instead of being silenced.
+    // [TODO] : handle input_b[idx] == 0
+    output[idx] = input_a[idx] / input_b[idx];
+    ++idx;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/MatMul/aidge_matmul.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/MatMul/aidge_matmul.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a1935661a2a643e7c1cd0d2d2e80ab0670c62ca
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/MatMul/aidge_matmul.hpp
@@ -0,0 +1,115 @@
+/**
+ * @brief Batched matrix product with broadcasting: output = input_a x input_b.
+ *
+ * The last two dimensions of each operand are its matrix dimensions
+ * (input_a: n x k, input_b: k x m); any leading dimensions are batch
+ * dimensions. Batch dimensions are right-aligned with those of dim_output, and
+ * an operand whose batch dimension is 1 or absent is reused along it.
+ * A 1-D input_a is treated as a 1 x k matrix and a 1-D input_b as a k x 1
+ * matrix; that unit dimension is expected to be absent from dim_output.
+ * Buffers are contiguous and row-major. Each dot product is accumulated in a
+ * float before being stored as T.
+ *
+ * @tparam T       Element type of the three data buffers.
+ * @tparam Dim_T   Element type of the dimension arrays.
+ * @tparam Size_T  Integer type of the dimension counts.
+ * @param input_a         Left operand.
+ * @param input_b         Right operand.
+ * @param output          Result buffer; must not overlap the inputs.
+ * @param dim_a           Dimensions of input_a, size_aDim entries.
+ * @param dim_b           Dimensions of input_b, size_bDim entries.
+ * @param dim_output      Dimensions of output, size_outputDim entries.
+ * @param size_aDim       Number of dimensions of input_a (>= 1).
+ * @param size_bDim       Number of dimensions of input_b (>= 1).
+ * @param size_outputDim  Number of dimensions of output.
+ */
+template <
+            typename T,
+            typename Dim_T,
+            typename Size_T
+        >
+__attribute__((always_inline)) inline static
+void aidge_matmul(T* __restrict input_a,
+                  T* __restrict input_b,
+                  T* __restrict output,
+                  Dim_T* __restrict dim_a,
+                  Dim_T* __restrict dim_b,
+                  Dim_T* __restrict dim_output,
+                  Size_T size_aDim,
+                  Size_T size_bDim,
+                  Size_T size_outputDim)
+{
+    const int rankA = static_cast<int>(size_aDim);
+    const int rankB = static_cast<int>(size_bDim);
+    const int rankOut = static_cast<int>(size_outputDim);
+    if (rankA < 1 || rankB < 1) {
+        return;
+    }
+
+    // A 1-D operand is promoted to a matrix for the product, but contributes
+    // no matrix dimension of its own to the output shape.
+    const int keepA = (rankA > 1) ? 1 : 0;
+    const int keepB = (rankB > 1) ? 1 : 0;
+
+    const int n = keepA ? static_cast<int>(dim_a[rankA - 2]) : 1;
+    const int k = static_cast<int>(dim_a[rankA - 1]);
+    const int m = keepB ? static_cast<int>(dim_b[rankB - 1]) : 1;
+    const int matrix0Size = n * k;
+    const int matrix1Size = k * m;
+    const int matrixOutSize = n * m;
+
+    // Leading (batch) dimensions: padA / padB are the number of output batch
+    // dimensions that the corresponding operand does not have at all.
+    const int batchRankA = keepA ? rankA - 2 : 0;
+    const int batchRankB = keepB ? rankB - 2 : 0;
+    int nbBatchDims = rankOut - keepA - keepB;
+    if (nbBatchDims < 0) {
+        nbBatchDims = 0;
+    }
+    const int padA = nbBatchDims - batchRankA;
+    const int padB = nbBatchDims - batchRankB;
+
+    int nbMatrices = 1;
+    for (int d = 0; d < nbBatchDims; ++d) {
+        nbMatrices *= dim_output[d];
+    }
+
+    for (int stack = 0; stack < nbMatrices; ++stack) {
+        // Decompose the output matrix index into batch coordinates (last
+        // dimension first) and rebuild each operand's matrix index from them,
+        // ignoring the dimensions along which that operand is broadcast.
+        int offsetIn0 = 0;
+        int offsetIn1 = 0;
+        int stride0 = 1;
+        int stride1 = 1;
+        int remaining = stack;
+        for (int d = nbBatchDims - 1; d >= 0; --d) {
+            const int outDim = static_cast<int>(dim_output[d]);
+            const int coord = remaining % outDim;
+            remaining /= outDim;
+            const int dimA = (d < padA) ? 1 : static_cast<int>(dim_a[d - padA]);
+            const int dimB = (d < padB) ? 1 : static_cast<int>(dim_b[d - padB]);
+            if (dimA != 1) {
+                offsetIn0 += coord * stride0;
+                stride0 *= dimA;
+            }
+            if (dimB != 1) {
+                offsetIn1 += coord * stride1;
+                stride1 *= dimB;
+            }
+        }
+
+        const T* matA = input_a + offsetIn0 * matrix0Size;
+        const T* matB = input_b + offsetIn1 * matrix1Size;
+        T* matOut = output + stack * matrixOutSize;
+        for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < m; ++j) {
+                float sum = 0;
+                for (int l = 0; l < k; ++l) {
+                    sum += (matA[i * k + l] * matB[l * m + j]);
+                }
+                matOut[i * m + j] = sum;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f522b94e38ca7c4b71399fc718fec932a720d3b
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.hpp
@@ -0,0 +1,21 @@
+/**
+ * @brief Element-wise product of two buffers: output[i] = input_a[i] * input_b[i].
+ *
+ * @tparam SIZE    Number of elements processed in each buffer.
+ * @tparam T       Element type shared by both operands and the result.
+ * @param input_a  Left operand, read at indices [0, SIZE).
+ * @param input_b  Right operand, read at indices [0, SIZE).
+ * @param output   Destination, written at indices [0, SIZE). All three
+ *                 pointers are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_mul(T* __restrict input_a, T* __restrict input_b, T* __restrict output) {
+  unsigned int idx = 0;
+  while (idx < SIZE) {
+    // Deliberately no explicit cast: an implicit narrowing back to T then
+    // stays visible as a compiler warning instead of being silenced.
+    output[idx] = input_a[idx] * input_b[idx];
+    ++idx;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_relu.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_relu.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b84a33b5484a8e294737d38f861b4f62d69995a
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_relu.hpp
@@ -0,0 +1,19 @@
+/**
+ * @brief Rectified linear unit: output[i] is 0 when input[i] < 0, else input[i].
+ *
+ * @tparam SIZE   Number of elements processed.
+ * @tparam T      Element type of both buffers.
+ * @param input   Source, read at indices [0, SIZE).
+ * @param output  Destination, written at indices [0, SIZE). Both pointers
+ *                are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_relu(T* __restrict input, T* __restrict output) {
+  // Compare and select in T itself. A float literal here would route every
+  // element through float, which silently rounds wide integers (beyond 2^24).
+  const T zero = T(0);
+  for (unsigned int i = 0; i < SIZE; ++i) {
+    output[i] = (input[i] < zero) ? zero : input[i];
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc52d42dc35650feb220c10dc8576439a64722
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.hpp
@@ -0,0 +1,18 @@
+/**
+ * @brief Copies SIZE elements from input to output without reordering them.
+ *
+ * @tparam SIZE   Number of elements copied.
+ * @tparam T      Element type of both buffers.
+ * @param input   Source, read at indices [0, SIZE).
+ * @param output  Destination, written at indices [0, SIZE). Both pointers
+ *                are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_reshape(T* __restrict input, T* __restrict output) {
+  T* src = input;
+  T* dst = output;
+  for (unsigned int remaining = SIZE; remaining != 0; --remaining) {
+    *dst++ = *src++;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.c b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.h
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.c
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.h
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb287dd5d7fcf838d9f6d6b1091804abdb9794a3
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.hpp
@@ -0,0 +1,21 @@
+#include <cmath>
+
+/**
+ * @brief Logistic sigmoid: outputs[i] = 1 / (1 + exp(-inputs[i])).
+ *
+ * @tparam SIZE    Number of elements processed.
+ * @tparam T       Element type of both buffers.
+ * @param inputs   Source, read at indices [0, SIZE).
+ * @param outputs  Destination, written at indices [0, SIZE). Both pointers
+ *                 are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_sigmoid(T* __restrict inputs, T* __restrict outputs) {
+  for (unsigned int i = 0; i < SIZE; ++i) {
+    // std::exp picks the overload matching T, so a float input is not
+    // promoted to double on the way through.
+    // Note : no cast to get compiler warning if we lose precision during auto cast!
+    outputs[i] = 1 / (1 + std::exp(-inputs[i]));
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Softmax/aidge_softmax.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Softmax/aidge_softmax.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..00db1f22dfc18ee8c7b9ef8b93fa9c176faac9ac
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Softmax/aidge_softmax.hpp
@@ -0,0 +1,93 @@
+#include <cmath>
+
+// Former capacity of the `exps` scratch buffer. aidge_softmax() now stages its
+// exponentials in `output`, so the softmax axis is no longer limited in length;
+// the macro and the buffer only remain so that existing references still build.
+#define MAX_DIMS_AXIS_SIZE 128
+// Internal linkage: a plain global defined in a header is defined again in
+// every translation unit that includes it, which fails at link time.
+__attribute__((unused)) static float exps[MAX_DIMS_AXIS_SIZE];
+
+/**
+ * @brief Softmax along one axis of a contiguous row-major tensor.
+ *
+ * For each 1-D slice x taken along `axis`:
+ *   output = exp(x - max(x)) / sum(exp(x - max(x)))
+ * Subtracting the slice maximum leaves the result mathematically unchanged but
+ * keeps exp() from overflowing on large inputs. Exponentials are evaluated in
+ * float.
+ *
+ * @tparam T       Element type of input and output.
+ * @tparam Dim_T   Element type of dims.
+ * @tparam Size_T  Integer type of dim_size.
+ * @param input     Source tensor.
+ * @param output    Destination tensor, same shape; must not overlap input.
+ * @param dims      Extent of each dimension, dim_size entries.
+ * @param dim_size  Number of dimensions.
+ * @param axis      Axis to normalize over; a negative value counts from the
+ *                  last dimension. Nothing is written if it is out of range.
+ */
+template <
+            typename T,
+            typename Dim_T,
+            typename Size_T
+        >
+__attribute__((always_inline)) inline static
+void aidge_softmax(T* __restrict input,
+                  T* __restrict output,
+                  Dim_T* __restrict dims,
+                  Size_T dim_size,
+                  int axis)
+{
+    const int rank = static_cast<int>(dim_size);
+    if (axis < 0) {
+        axis += rank;
+    }
+    if (axis < 0 || axis >= rank) {
+        return;
+    }
+
+    const int axisLen = static_cast<int>(dims[axis]);
+    if (axisLen <= 0) {
+        return;  // empty slices: nothing to normalize
+    }
+
+    int postAxisElems = 1;
+    for (int index = axis + 1; index < rank; ++index) {
+        postAxisElems *= dims[index];
+    }
+    int preAxisElems = 1;
+    for (int index = 0; index < axis; ++index) {
+        preAxisElems *= dims[index];
+    }
+
+    for (int i = 0; i < preAxisElems; ++i) {
+        for (int j = 0; j < postAxisElems; ++j) {
+            // Consecutive elements of a slice are postAxisElems apart.
+            const int baseIdx = i * axisLen * postAxisElems + j;
+
+            // Slice maximum, used to shift the exponent arguments.
+            float maxVal = input[baseIdx];
+            for (int k = 1; k < axisLen; ++k) {
+                const float value = input[baseIdx + k * postAxisElems];
+                if (value > maxVal) {
+                    maxVal = value;
+                }
+            }
+
+            // `output` doubles as scratch storage for the exponentials.
+            float sumExp = 0.0f;
+            for (int k = 0; k < axisLen; ++k) {
+                const int idx = baseIdx + k * postAxisElems;
+                const float shifted = input[idx] - maxVal;
+                const float e = std::exp(shifted);
+                output[idx] = e;
+                sumExp += e;
+            }
+            for (int k = 0; k < axisLen; ++k) {
+                const int idx = baseIdx + k * postAxisElems;
+                output[idx] = output[idx] / sumExp;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8bbc194c5e4c430360f6058491ec752d18abe863
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.hpp
@@ -0,0 +1,21 @@
+/**
+ * @brief Element-wise difference of two buffers: output[i] = input_a[i] - input_b[i].
+ *
+ * @tparam SIZE    Number of elements processed in each buffer.
+ * @tparam T       Element type shared by both operands and the result.
+ * @param input_a  Left operand, read at indices [0, SIZE).
+ * @param input_b  Right operand, read at indices [0, SIZE).
+ * @param output   Destination, written at indices [0, SIZE). All three
+ *                 pointers are __restrict, so the buffers must not overlap.
+ */
+template <unsigned int SIZE, typename T>
+__attribute__((always_inline)) inline static
+void aidge_sub(T* __restrict input_a, T* __restrict input_b, T* __restrict output) {
+  unsigned int idx = 0;
+  while (idx < SIZE) {
+    // Deliberately no explicit cast: an implicit narrowing back to T then
+    // stays visible as a compiler warning instead of being silenced.
+    output[idx] = input_a[idx] - input_b[idx];
+    ++idx;
+  }
+}
\ No newline at end of file