diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d3c1a95d4eb2361120cff884d8d16e49045ef90
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Add/aidge_add.h
@@ -0,0 +1,23 @@
+/**
+ * @brief Element-wise sum of two buffers.
+ *
+ * Stores input_a[k] + input_b[k] into output[k] for every k in [0, SIZE).
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of both operands.
+ * @tparam Output_T  Element type of the destination.
+ * @param input_a    First operand, read at indices [0, SIZE).
+ * @param input_b    Second operand, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_add(Input_T* __restrict input_a, Input_T* __restrict input_b, Output_T* __restrict output) {
+  unsigned int k = 0;
+  while (k < SIZE) {
+    // Deliberately no explicit cast: a lossy implicit conversion to Output_T
+    // should surface as a compiler warning.
+    output[k] = input_a[k] + input_b[k];
+    ++k;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.h
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.hpp
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/Atan/aidge_atan.h
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchNorm.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchNorm.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6cf9f75d19d4eb153345ac034115eb98edf78a3
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchNorm.h
@@ -0,0 +1,65 @@
+#include <cmath>
+
+/**
+ * @brief Batch normalization over a channel-major buffer.
+ *
+ * Each channel c owns a contiguous run of featureMapSize elements starting at
+ * c * featureMapSize, where featureMapSize is the product of the
+ * NB_SpatialDims entries of spatial_dims. Every element x of that run becomes
+ *
+ *     bias[c] + scale[c] * (x - input_mean[c]) / sqrt(input_var[c] + epsilon)
+ *
+ * @tparam Input_T        Element type of inputs.
+ * @tparam Output_T       Element type of outputs.
+ * @tparam MeanVar_T      Element type of input_mean and input_var.
+ * @tparam ScaleBias_T    Element type of scale and bias.
+ * @tparam SpatialDims_T  Element type of spatial_dims.
+ * @tparam NB_Channels    Number of channels.
+ * @tparam NB_SpatialDims Number of entries read from spatial_dims.
+ * @param inputs        Source values, NB_Channels * featureMapSize elements.
+ * @param outputs       Destination, same element count as inputs.
+ * @param input_mean    Per-channel mean, NB_Channels elements.
+ * @param input_var     Per-channel variance, NB_Channels elements.
+ * @param scale         Per-channel multiplier, NB_Channels elements.
+ * @param bias          Per-channel offset, NB_Channels elements.
+ * @param spatial_dims  Extent of each spatial dimension.
+ * @param epsilon       Added to the variance before taking the square root.
+ */
+template <
+            typename Input_T,
+            typename Output_T,
+            typename MeanVar_T,
+            typename ScaleBias_T,
+            typename SpatialDims_T,
+            unsigned int NB_Channels,
+            unsigned int NB_SpatialDims
+        >
+__attribute__((always_inline)) inline static
+void aidge_batchnorm(Input_T* __restrict inputs,
+                    Output_T* __restrict outputs,
+                    MeanVar_T* __restrict input_mean,
+                    MeanVar_T* __restrict input_var,
+                    ScaleBias_T* __restrict scale,
+                    ScaleBias_T* __restrict bias,
+                    SpatialDims_T* __restrict spatial_dims,
+                    float epsilon)
+{
+    int featureMapSize = 1;
+    for (unsigned int dim = 0; dim < NB_SpatialDims; ++dim) {
+        featureMapSize *= spatial_dims[dim];
+    }
+
+    for (unsigned int channel = 0; channel < NB_Channels; ++channel) {
+        const int base = static_cast<int>(channel) * featureMapSize;
+        // std::sqrt selects the overload matching the operand type; the
+        // unqualified name is not guaranteed to exist after <cmath> alone.
+        const float stddev = std::sqrt(input_var[channel] + epsilon);
+
+        for (int offset = 0; offset < featureMapSize; ++offset) {
+            // Same arithmetic as "store bias, then accumulate", in one pass.
+            Output_T value = bias[channel];
+            value += scale[channel] * (inputs[base + offset] - input_mean[channel]) / stddev;
+            outputs[base + offset] = value;
+        }
+    }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.c b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.h
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.c
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/BatchNorm/aidge_batchnorm2d_chw_float32.h
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Broadcast/aidge_broadcast.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Broadcast/aidge_broadcast.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd76ae8e8a4f318b63092f1643b6da27835aa5b7
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Broadcast/aidge_broadcast.h
@@ -0,0 +1,125 @@
+/*
+ Broadcasting helpers for float tensors stored as flat buffers whose last
+ dimension is contiguous.
+*/
+#include <iostream>
+#include <vector>
+#include <stdexcept>
+
+/**
+ * @brief Expand a tensor to a larger shape by repeating it along broadcast axes.
+ *
+ * Input and output dimensions are matched from the last one backwards. Where
+ * the two extents are equal the output coordinate is reused for the input;
+ * otherwise (different extent, or the input has no such dimension) input
+ * coordinate 0 is used.
+ *
+ * @param inputs        Tensor values to broadcast.
+ * @param output        Receives output_size values.
+ * @param dim_input     Extents of the input tensor, nb_dimInput entries.
+ * @param dim_output    Extents of the output tensor, nb_dimOutput entries.
+ * @param nb_dimInput   Number of input dimensions.
+ * @param nb_dimOutput  Number of output dimensions.
+ * @param input_size    Number of elements available in inputs.
+ * @param output_size   Number of elements to write to output.
+ * @throws std::out_of_range if a computed input index is not below input_size.
+ */
+inline void broadcast(
+    const float* inputs,
+    float* output,
+    const int dim_input[],
+    const int dim_output[],
+    int nb_dimInput,
+    int nb_dimOutput,
+    int input_size,
+    int output_size) {
+
+    // Element strides of the input; the last dimension has stride 1.
+    std::vector<int> stride_input(nb_dimInput, 1);
+    for (int i = nb_dimInput - 2; i >= 0; --i) {
+        stride_input[i] = stride_input[i + 1] * dim_input[i + 1];
+    }
+
+    for (int i = 0; i < output_size; ++i) {
+        int idx_input = 0;
+        int remaining = i;
+        int d_in = nb_dimInput - 1;
+
+        // Peel output coordinates off the flat index, last dimension first.
+        for (int d = nb_dimOutput - 1; d >= 0; --d) {
+            const int coord_out = remaining % dim_output[d];
+            remaining /= dim_output[d];
+
+            if (d_in < 0) {
+                continue;  // the input has no dimension left to match
+            }
+            if (dim_input[d_in] == dim_output[d]) {
+                idx_input += coord_out * stride_input[d_in];
+            }
+            --d_in;
+        }
+
+        if (idx_input >= input_size) {
+            throw std::out_of_range("Index out of range in input tensor.");
+        }
+        output[i] = inputs[idx_input];
+    }
+}
+
+/**
+ * @brief Tell whether tensor A or tensor B must be broadcast to the output shape.
+ *
+ * Both shapes are right-aligned against dim_output; dimensions an operand
+ * does not have count as extent 1. Every dimension is validated before the
+ * result is returned.
+ *
+ * @param tensor_a      Not used.
+ * @param tensor_b      Not used.
+ * @param dim_a         Extents of tensor A, nb_dimA entries.
+ * @param dim_b         Extents of tensor B, nb_dimB entries.
+ * @param dim_output    Extents of the output, nb_dimOutput entries.
+ * @param nb_dimA       Number of dimensions of tensor A.
+ * @param nb_dimB       Number of dimensions of tensor B.
+ * @param nb_dimOutput  Number of dimensions of the output.
+ * @param size_a        Not used.
+ * @param size_b        Not used.
+ * @return true if at least one extent of A or B differs from the output's.
+ * @throws std::invalid_argument if an extent of A or B is neither 1 nor the
+ *         corresponding output extent.
+ */
+inline bool should_broadcast(
+    const float* tensor_a, const float* tensor_b,
+    const int dim_a[], const int dim_b[], const int dim_output[],
+    int nb_dimA, int nb_dimB, int nb_dimOutput,
+    int size_a, int size_b) {
+
+    // Unused, kept for interface compatibility.
+    (void)tensor_a;
+    (void)tensor_b;
+    (void)size_a;
+    (void)size_b;
+
+    // Number of leading output dimensions that each operand lacks.
+    const int offset_a = nb_dimOutput - nb_dimA;
+    const int offset_b = nb_dimOutput - nb_dimB;
+    bool need_broadcast = false;
+
+    for (int i = 0; i < nb_dimOutput; ++i) {
+        const int dim_a_val = (i >= offset_a) ? dim_a[i - offset_a] : 1;
+        const int dim_b_val = (i >= offset_b) ? dim_b[i - offset_b] : 1;
+
+        if (dim_a_val != dim_output[i]) {
+            if (dim_a_val != 1) {
+                throw std::invalid_argument("Tensor A cannot be broadcasted to output dimensions.");
+            }
+            need_broadcast = true;
+        }
+        if (dim_b_val != dim_output[i]) {
+            if (dim_b_val != 1) {
+                throw std::invalid_argument("Tensor B cannot be broadcasted to output dimensions.");
+            }
+            need_broadcast = true;
+        }
+    }
+    return need_broadcast;
+}
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Concat/aidge_concat_float32.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Concat/aidge_concat.h
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/Concat/aidge_concat_float32.hpp
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/Concat/aidge_concat.h
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.h
new file mode 100644
index 0000000000000000000000000000000000000000..fafa29834d14e61bdcc6431e6e3f248efcc4bfdd
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Div/aidge_div.h
@@ -0,0 +1,24 @@
+/**
+ * @brief Element-wise quotient of two buffers.
+ *
+ * Stores input_a[k] / input_b[k] into output[k] for every k in [0, SIZE).
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of both operands.
+ * @tparam Output_T  Element type of the destination.
+ * @param input_a    Dividend, read at indices [0, SIZE).
+ * @param input_b    Divisor, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_div(Input_T* __restrict input_a, Input_T* __restrict input_b, Output_T* __restrict output) {
+  unsigned int k = 0;
+  while (k < SIZE) {
+    // Deliberately no explicit cast: a lossy implicit conversion to Output_T
+    // should surface as a compiler warning.
+    // TODO: a zero divisor in input_b is not handled.
+    output[k] = input_a[k] / input_b[k];
+    ++k;
+  }
+}
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.h
new file mode 100644
index 0000000000000000000000000000000000000000..b18c2ab4b6a9f43d399d3cb0b7a0bff08a366c50
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Mul/aidge_mul.h
@@ -0,0 +1,23 @@
+/**
+ * @brief Element-wise product of two buffers.
+ *
+ * Stores input_a[k] * input_b[k] into output[k] for every k in [0, SIZE).
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of both operands.
+ * @tparam Output_T  Element type of the destination.
+ * @param input_a    First factor, read at indices [0, SIZE).
+ * @param input_b    Second factor, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_mul(Input_T* __restrict input_a, Input_T* __restrict input_b, Output_T* __restrict output) {
+  unsigned int k = 0;
+  while (k < SIZE) {
+    // Deliberately no explicit cast: a lossy implicit conversion to Output_T
+    // should surface as a compiler warning.
+    output[k] = input_a[k] * input_b[k];
+    ++k;
+  }
+}
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_elu.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_elu.h
new file mode 100644
index 0000000000000000000000000000000000000000..f55c5afe301009f5d1cf67e3f30017232c03087e
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Relu/aidge_elu.h
@@ -0,0 +1,23 @@
+/**
+ * @brief Rectified linear unit applied element-wise.
+ *
+ * For every k in [0, SIZE): output[k] is zero when input[k] is negative,
+ * otherwise input[k].
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of the source.
+ * @tparam Output_T  Element type of the destination.
+ * @param input      Source, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_relu(Input_T* __restrict input, Output_T* __restrict output) {
+  // Compare and select in the input's own type. A float literal would promote
+  // integer inputs to float, which cannot represent every integer above 2^24
+  // exactly and forces floating-point arithmetic on integer data.
+  const Input_T zero = static_cast<Input_T>(0);
+  for (unsigned int k = 0; k < SIZE; ++k) {
+    output[k] = (input[k] < zero) ? zero : input[k];
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.h
new file mode 100644
index 0000000000000000000000000000000000000000..95465aba70e86afa1e087193d2ee63536934a68a
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape.h
@@ -0,0 +1,21 @@
+/**
+ * @brief Copy SIZE elements from input to output.
+ *
+ * The element order is kept as is: output[k] = input[k] for every k in
+ * [0, SIZE). No shape information is used by this function.
+ *
+ * @tparam SIZE      Number of elements to copy.
+ * @tparam Input_T   Element type of the source.
+ * @tparam Output_T  Element type of the destination.
+ * @param input      Source, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_reshape(Input_T* __restrict input, Output_T* __restrict output) {
+  unsigned int k = 0;
+  while (k < SIZE) {
+    output[k] = input[k];
+    ++k;
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.c b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.h
similarity index 100%
rename from aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.c
rename to aidge_export_arm_cortexm/_Aidge_Arm/kernels/Reshape/aidge_reshape_chw_float32.h
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1f3f5a571047bc3276e3312f4aa8809e65ac93f
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sigmoid/aidge_sigmoid.h
@@ -0,0 +1,24 @@
+#include <math.h>
+
+/**
+ * @brief Logistic sigmoid applied element-wise.
+ *
+ * Stores 1 / (1 + exp(-input_a[k])) into output[k] for every k in [0, SIZE).
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of the source.
+ * @tparam Output_T  Element type of the destination.
+ * @param input_a    Source, read at indices [0, SIZE).
+ * @param input_b    Not used; kept so that existing call sites keep compiling.
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_sigmoid(Input_T* __restrict input_a, Input_T* __restrict input_b, Output_T* __restrict output) {
+  (void)input_b;  // silence the unused-parameter warning
+  for (unsigned int i = 0; i < SIZE; ++i) {
+    // Deliberately no explicit cast: a lossy implicit conversion to Output_T
+    // should surface as a compiler warning.
+    output[i] = 1 / (1 + exp(-input_a[i]));
+  }
+}
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.h b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e9954d519216d1d43bb13c1d8f0e4eb1370bc74
--- /dev/null
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Sub/aidge_sub.h
@@ -0,0 +1,23 @@
+/**
+ * @brief Element-wise difference of two buffers.
+ *
+ * Stores input_a[k] - input_b[k] into output[k] for every k in [0, SIZE).
+ *
+ * @tparam SIZE      Number of elements to process.
+ * @tparam Input_T   Element type of both operands.
+ * @tparam Output_T  Element type of the destination.
+ * @param input_a    Minuend, read at indices [0, SIZE).
+ * @param input_b    Subtrahend, read at indices [0, SIZE).
+ * @param output     Destination, written at indices [0, SIZE).
+ */
+template <unsigned int SIZE, typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void aidge_sub(Input_T* __restrict input_a, Input_T* __restrict input_b, Output_T* __restrict output) {
+  unsigned int k = 0;
+  while (k < SIZE) {
+    // Deliberately no explicit cast: a lossy implicit conversion to Output_T
+    // should surface as a compiler warning.
+    output[k] = input_a[k] - input_b[k];
+    ++k;
+  }
+}
+}
\ No newline at end of file