From b29fd0a7f367572fff0ba06c3ab2f4376c900458 Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Fri, 11 Apr 2025 13:51:35 +0200
Subject: [PATCH 1/6] first step for benchmark

---
 aidge_export_arm_cortexm/benchmark.py         | 138 +++++
 .../templates/main_call/main_stm32f7.jinja    | 474 ++++++++++++++++++
 .../templates/main_call/main_stm32h7.jinja    | 365 ++++++++++++++
 .../templates/main_call/print_output.jinja    |  44 ++
 ba.py                                         | 102 ++++
 5 files changed, 1123 insertions(+)
 create mode 100644 aidge_export_arm_cortexm/benchmark.py
 create mode 100644 aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
 create mode 100644 aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja
 create mode 100644 aidge_export_arm_cortexm/templates/main_call/print_output.jinja
 create mode 100644 ba.py

diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py
new file mode 100644
index 0000000..6366c57
--- /dev/null
+++ b/aidge_export_arm_cortexm/benchmark.py
@@ -0,0 +1,138 @@
+import contextlib
+import os
+from shutil import rmtree
+from subprocess import run
+
+import numpy as np
+
+import aidge_core
+import aidge_backend_cpu
+import aidge_export_arm_cortexm
+
+def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
+    # load and set up the model
+    # model.set_datatype(ai.dtype.float32)
+    model.set_backend("cpu")
+
+    # create input Tensor list for the GraphView
+    ordered_inputs: list[aidge_core.Tensor] = []
+    # [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
+    for i in input_data:
+        nb_dims = len(i[1].shape)
+        if nb_dims == 3:
+            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
+        elif nb_dims == 4:
+            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
+        else:
+            ordered_inputs.append(aidge_core.Tensor(i[1]))
+
+    # set inputs for the export
+    for i, inp in enumerate(model.get_ordered_inputs()):
+        op = inp[0].get_operator()
+        op.set_input(i, ordered_inputs[i])
+
+    model.forward_dims([t.dims() for t in ordered_inputs])
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    # for ordered_input in ordered_inputs:
+        # ordered_input.set_backend("cpu")
+    operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
+    print("  ├─Generating export...", end="", flush=True)
+    folder_name: str = f"{operator_type.lower()}_test_export_cpp"
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        aidge_core.export_utils.scheduler_export(
+            scheduler,
+            folder_name,
+            aidge_export_arm_cortexm.ExportLibAidgeARM,
+            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
+            memory_manager_args={"wrapping": False }
+        )
+        aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
+    print(" ok")
+
+    print("  ├─Compiling...", end="", flush=True)
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        run(['make'], cwd=folder_name, stdout=f)
+    print(" ok")
+    timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
+
+    folder_path = os.path.abspath(folder_name)
+    if os.path.exists(folder_path):
+        rmtree(folder_path, ignore_errors=True)
+
+    timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
+    return timings
+
+def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
+    # load and set up the model
+    model.set_backend("cpu")
+
+    # create input Tensor list for the GraphView
+    ordered_inputs: list[aidge_core.Tensor] = []
+    # [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
+    for i in input_data:
+        nb_dims = len(i[1].shape)
+        if nb_dims == 3:
+            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
+        elif nb_dims == 4:
+            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
+        else:
+            ordered_inputs.append(aidge_core.Tensor(i[1]))
+
+    # set inputs for the export
+    for i, inp in enumerate(model.get_ordered_inputs()):
+        op = inp[0].get_operator()
+        op.set_input(i, ordered_inputs[i])
+
+    model.forward_dims([t.dims() for t in ordered_inputs])
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+
+    operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
+    print("  │ Generating export...", end="", flush=True)
+    folder_name: str = f"{operator_type.lower()}_test_export_cpp"
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        aidge_core.export_utils.scheduler_export(
+            scheduler,
+            folder_name,
+            aidge_export_arm_cortexm.ExportLibAidgeARM,
+            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
+            memory_manager_args={"wrapping": False }
+        )
+        aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
+    print(" ok")
+
+    print("  │ Compiling...", end="", flush=True)
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        run(['make'], cwd=folder_name, stdout=f)
+    print(" ok")
+    output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
+    folder_path = os.path.abspath(folder_name)
+    if os.path.exists(folder_path):
+        rmtree(folder_path, ignore_errors=True)
+
+    outputs_str: list[str] = output_str.stdout.strip().split('\n')
+    outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
+
+    for i, pair in enumerate(model.get_ordered_outputs()):
+        dims = pair[0].get_operator().get_output(pair[1]).dims()
+        nb_dims = len(dims)
+        dims_permutted = dims
+        if nb_dims == 3:
+            dims_permutted = [dims[0], dims[2], dims[1]]
+        if nb_dims == 4:
+            dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
+
+        if np.prod(dims) != outputs[i].size:
+            aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
+        outputs[i] = outputs[i].reshape(dims_permutted)
+        if nb_dims == 3:
+            outputs[i] = outputs[i].transpose(0,2,1)
+        if nb_dims == 4:
+            outputs[i] = outputs[i].transpose(0,3,1,2)
+
+    return outputs
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
new file mode 100644
index 0000000..a5f8e5e
--- /dev/null
+++ b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
@@ -0,0 +1,474 @@
+/* USER CODE BEGIN Header */
+/**
+  ******************************************************************************
+  * @file           : main.c
+  * @brief          : Main program body
+  ******************************************************************************
+  * @attention
+  *
+  * Copyright (c) 2024 STMicroelectronics.
+  * All rights reserved.
+  *
+  * This software is licensed under terms that can be found in the LICENSE file
+  * in the root directory of this software component.
+  * If no LICENSE file comes with this software, it is provided AS-IS.
+  *
+  ******************************************************************************
+  */
+/* USER CODE END Header */
+/* Includes ------------------------------------------------------------------*/
+#include "main.h"
+#include "string.h"
+
+/* Private includes ----------------------------------------------------------*/
+/* USER CODE BEGIN Includes */
+
+#include <stdio.h>
+#include <stdint.h>
+
+/* USER CODE END Includes */
+
+/* Private typedef -----------------------------------------------------------*/
+/* USER CODE BEGIN PTD */
+
+/* USER CODE END PTD */
+
+/* Private define ------------------------------------------------------------*/
+/* USER CODE BEGIN PD */
+
+/* USER CODE END PD */
+
+/* Private macro -------------------------------------------------------------*/
+/* USER CODE BEGIN PM */
+
+/* USER CODE END PM */
+
+/* Private variables ---------------------------------------------------------*/
+#if defined ( __ICCARM__ ) /*!< IAR Compiler */
+#pragma location=0x2004c000
+ETH_DMADescTypeDef  DMARxDscrTab[ETH_RX_DESC_CNT]; /* Ethernet Rx DMA Descriptors */
+#pragma location=0x2004c0a0
+ETH_DMADescTypeDef  DMATxDscrTab[ETH_TX_DESC_CNT]; /* Ethernet Tx DMA Descriptors */
+
+#elif defined ( __CC_ARM )  /* MDK ARM Compiler */
+
+__attribute__((at(0x2004c000))) ETH_DMADescTypeDef  DMARxDscrTab[ETH_RX_DESC_CNT]; /* Ethernet Rx DMA Descriptors */
+__attribute__((at(0x2004c0a0))) ETH_DMADescTypeDef  DMATxDscrTab[ETH_TX_DESC_CNT]; /* Ethernet Tx DMA Descriptors */
+
+#elif defined ( __GNUC__ ) /* GNU Compiler */
+
+ETH_DMADescTypeDef DMARxDscrTab[ETH_RX_DESC_CNT] __attribute__((section(".RxDecripSection"))); /* Ethernet Rx DMA Descriptors */
+ETH_DMADescTypeDef DMATxDscrTab[ETH_TX_DESC_CNT] __attribute__((section(".TxDecripSection")));   /* Ethernet Tx DMA Descriptors */
+#endif
+
+ETH_TxPacketConfig TxConfig;
+
+ETH_HandleTypeDef heth;
+
+TIM_HandleTypeDef htim2;
+
+UART_HandleTypeDef huart3;
+
+PCD_HandleTypeDef hpcd_USB_OTG_FS;
+
+/* USER CODE BEGIN PV */
+
+/* USER CODE END PV */
+
+/* Private function prototypes -----------------------------------------------*/
+void SystemClock_Config(void);
+static void MX_GPIO_Init(void);
+static void MX_ETH_Init(void);
+static void MX_TIM2_Init(void);
+static void MX_USART3_UART_Init(void);
+static void MX_USB_OTG_FS_PCD_Init(void);
+/* USER CODE BEGIN PFP */
+
+/* USER CODE END PFP */
+
+/* Private user code ---------------------------------------------------------*/
+/* USER CODE BEGIN 0 */
+
+int _write(int file, char *data, int len)
+{
+   HAL_StatusTypeDef status = HAL_UART_Transmit(&huart3, (uint8_t*) data, len, 1000);
+
+   return (status == HAL_OK ? len : 0);
+}
+
+/* USER CODE END 0 */
+
+/**
+  * @brief  The application entry point.
+  * @retval int
+  */
+#include "{{ call_function }}.hpp"
+int main(void)
+{
+
+  /* USER CODE BEGIN 1 */
+
+  /* USER CODE END 1 */
+
+  /* MCU Configuration--------------------------------------------------------*/
+
+  /* Reset of all peripherals, Initializes the Flash interface and the Systick. */
+  HAL_Init();
+
+  /* USER CODE BEGIN Init */
+
+  /* USER CODE END Init */
+
+  /* Configure the system clock */
+  SystemClock_Config();
+
+  /* USER CODE BEGIN SysInit */
+
+  /* USER CODE END SysInit */
+
+  /* Initialize all configured peripherals */
+  MX_GPIO_Init();
+  MX_ETH_Init();
+  MX_TIM2_Init();
+  MX_USART3_UART_Init();
+  MX_USB_OTG_FS_PCD_Init();
+  /* USER CODE BEGIN 2 */
+
+  printf("\r\n");
+  printf("*****************************************************\r\n");
+  printf("****************** DEMO EXPORT ARM ******************\r\n");
+  printf("*****************************************************\r\n");
+  printf("\r\n");
+  {{ call_function }}();
+  printf("\r\n");
+  printf("*****************************************************\r\n");
+  printf("********************** END DEMO *********************\r\n");
+  printf("*****************************************************\r\n");
+  printf("\r\n");
+
+  /* USER CODE END 2 */
+
+  /* Infinite loop */
+  /* USER CODE BEGIN WHILE */
+  while (1)
+  {
+    /* USER CODE END WHILE */
+
+    /* USER CODE BEGIN 3 */
+  }
+  /* USER CODE END 3 */
+}
+
+/**
+  * @brief System Clock Configuration
+  * @retval None
+  */
+void SystemClock_Config(void)
+{
+  RCC_OscInitTypeDef RCC_OscInitStruct = {0};
+  RCC_ClkInitTypeDef RCC_ClkInitStruct = {0};
+
+  /** Configure LSE Drive Capability
+  */
+  HAL_PWR_EnableBkUpAccess();
+
+  /** Configure the main internal regulator output voltage
+  */
+  __HAL_RCC_PWR_CLK_ENABLE();
+  __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE3);
+
+  /** Initializes the RCC Oscillators according to the specified parameters
+  * in the RCC_OscInitTypeDef structure.
+  */
+  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSE;
+  RCC_OscInitStruct.HSEState = RCC_HSE_BYPASS;
+  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
+  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
+  RCC_OscInitStruct.PLL.PLLM = 4;
+  RCC_OscInitStruct.PLL.PLLN = 72;
+  RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;
+  RCC_OscInitStruct.PLL.PLLQ = 3;
+  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK)
+  {
+    Error_Handler();
+  }
+
+  /** Initializes the CPU, AHB and APB buses clocks
+  */
+  RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK
+                              |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2;
+  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;
+  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;
+  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV2;
+  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV1;
+
+  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_2) != HAL_OK)
+  {
+    Error_Handler();
+  }
+}
+
+/**
+  * @brief ETH Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_ETH_Init(void)
+{
+
+  /* USER CODE BEGIN ETH_Init 0 */
+
+  /* USER CODE END ETH_Init 0 */
+
+   static uint8_t MACAddr[6];
+
+  /* USER CODE BEGIN ETH_Init 1 */
+
+  /* USER CODE END ETH_Init 1 */
+  heth.Instance = ETH;
+  MACAddr[0] = 0x00;
+  MACAddr[1] = 0x80;
+  MACAddr[2] = 0xE1;
+  MACAddr[3] = 0x00;
+  MACAddr[4] = 0x00;
+  MACAddr[5] = 0x00;
+  heth.Init.MACAddr = &MACAddr[0];
+  heth.Init.MediaInterface = HAL_ETH_RMII_MODE;
+  heth.Init.TxDesc = DMATxDscrTab;
+  heth.Init.RxDesc = DMARxDscrTab;
+  heth.Init.RxBuffLen = 1524;
+
+  /* USER CODE BEGIN MACADDRESS */
+
+  /* USER CODE END MACADDRESS */
+
+  if (HAL_ETH_Init(&heth) != HAL_OK)
+  {
+    Error_Handler();
+  }
+
+  memset(&TxConfig, 0 , sizeof(ETH_TxPacketConfig));
+  TxConfig.Attributes = ETH_TX_PACKETS_FEATURES_CSUM | ETH_TX_PACKETS_FEATURES_CRCPAD;
+  TxConfig.ChecksumCtrl = ETH_CHECKSUM_IPHDR_PAYLOAD_INSERT_PHDR_CALC;
+  TxConfig.CRCPadCtrl = ETH_CRC_PAD_INSERT;
+  /* USER CODE BEGIN ETH_Init 2 */
+
+  /* USER CODE END ETH_Init 2 */
+
+}
+
+/**
+  * @brief TIM2 Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_TIM2_Init(void)
+{
+
+  /* USER CODE BEGIN TIM2_Init 0 */
+
+  /* USER CODE END TIM2_Init 0 */
+
+  TIM_ClockConfigTypeDef sClockSourceConfig = {0};
+  TIM_MasterConfigTypeDef sMasterConfig = {0};
+
+  /* USER CODE BEGIN TIM2_Init 1 */
+
+  /* USER CODE END TIM2_Init 1 */
+  htim2.Instance = TIM2;
+  htim2.Init.Prescaler = 16000;
+  htim2.Init.CounterMode = TIM_COUNTERMODE_UP;
+  htim2.Init.Period = 1000;
+  htim2.Init.ClockDivision = TIM_CLOCKDIVISION_DIV1;
+  htim2.Init.AutoReloadPreload = TIM_AUTORELOAD_PRELOAD_DISABLE;
+  if (HAL_TIM_Base_Init(&htim2) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  sClockSourceConfig.ClockSource = TIM_CLOCKSOURCE_INTERNAL;
+  if (HAL_TIM_ConfigClockSource(&htim2, &sClockSourceConfig) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  sMasterConfig.MasterOutputTrigger = TIM_TRGO_RESET;
+  sMasterConfig.MasterSlaveMode = TIM_MASTERSLAVEMODE_DISABLE;
+  if (HAL_TIMEx_MasterConfigSynchronization(&htim2, &sMasterConfig) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  /* USER CODE BEGIN TIM2_Init 2 */
+
+  /* USER CODE END TIM2_Init 2 */
+
+}
+
+/**
+  * @brief USART3 Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_USART3_UART_Init(void)
+{
+
+  /* USER CODE BEGIN USART3_Init 0 */
+
+  /* USER CODE END USART3_Init 0 */
+
+  /* USER CODE BEGIN USART3_Init 1 */
+
+  /* USER CODE END USART3_Init 1 */
+  huart3.Instance = USART3;
+  huart3.Init.BaudRate = 115200;
+  huart3.Init.WordLength = UART_WORDLENGTH_8B;
+  huart3.Init.StopBits = UART_STOPBITS_1;
+  huart3.Init.Parity = UART_PARITY_NONE;
+  huart3.Init.Mode = UART_MODE_TX_RX;
+  huart3.Init.HwFlowCtl = UART_HWCONTROL_NONE;
+  huart3.Init.OverSampling = UART_OVERSAMPLING_16;
+  huart3.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE;
+  huart3.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT;
+  if (HAL_UART_Init(&huart3) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  /* USER CODE BEGIN USART3_Init 2 */
+
+  /* USER CODE END USART3_Init 2 */
+
+}
+
+/**
+  * @brief USB_OTG_FS Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_USB_OTG_FS_PCD_Init(void)
+{
+
+  /* USER CODE BEGIN USB_OTG_FS_Init 0 */
+
+  /* USER CODE END USB_OTG_FS_Init 0 */
+
+  /* USER CODE BEGIN USB_OTG_FS_Init 1 */
+
+  /* USER CODE END USB_OTG_FS_Init 1 */
+  hpcd_USB_OTG_FS.Instance = USB_OTG_FS;
+  hpcd_USB_OTG_FS.Init.dev_endpoints = 6;
+  hpcd_USB_OTG_FS.Init.speed = PCD_SPEED_FULL;
+  hpcd_USB_OTG_FS.Init.dma_enable = DISABLE;
+  hpcd_USB_OTG_FS.Init.phy_itface = PCD_PHY_EMBEDDED;
+  hpcd_USB_OTG_FS.Init.Sof_enable = ENABLE;
+  hpcd_USB_OTG_FS.Init.low_power_enable = DISABLE;
+  hpcd_USB_OTG_FS.Init.lpm_enable = DISABLE;
+  hpcd_USB_OTG_FS.Init.vbus_sensing_enable = ENABLE;
+  hpcd_USB_OTG_FS.Init.use_dedicated_ep1 = DISABLE;
+  if (HAL_PCD_Init(&hpcd_USB_OTG_FS) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  /* USER CODE BEGIN USB_OTG_FS_Init 2 */
+
+  /* USER CODE END USB_OTG_FS_Init 2 */
+
+}
+
+/**
+  * @brief GPIO Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_GPIO_Init(void)
+{
+  GPIO_InitTypeDef GPIO_InitStruct = {0};
+/* USER CODE BEGIN MX_GPIO_Init_1 */
+/* USER CODE END MX_GPIO_Init_1 */
+
+  /* GPIO Ports Clock Enable */
+  __HAL_RCC_GPIOC_CLK_ENABLE();
+  __HAL_RCC_GPIOH_CLK_ENABLE();
+  __HAL_RCC_GPIOA_CLK_ENABLE();
+  __HAL_RCC_GPIOB_CLK_ENABLE();
+  __HAL_RCC_GPIOD_CLK_ENABLE();
+  __HAL_RCC_GPIOG_CLK_ENABLE();
+
+  /*Configure GPIO pin Output Level */
+  HAL_GPIO_WritePin(GPIOB, LD1_Pin|LD3_Pin|LD2_Pin, GPIO_PIN_RESET);
+
+  /*Configure GPIO pin Output Level */
+  HAL_GPIO_WritePin(USB_PowerSwitchOn_GPIO_Port, USB_PowerSwitchOn_Pin, GPIO_PIN_RESET);
+
+  /*Configure GPIO pin Output Level */
+  HAL_GPIO_WritePin(GPIOC, GPIO_PIN_7, GPIO_PIN_RESET);
+
+  /*Configure GPIO pin : USER_Btn_Pin */
+  GPIO_InitStruct.Pin = USER_Btn_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(USER_Btn_GPIO_Port, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : LD1_Pin LD3_Pin LD2_Pin */
+  GPIO_InitStruct.Pin = LD1_Pin|LD3_Pin|LD2_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  HAL_GPIO_Init(GPIOB, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : USB_PowerSwitchOn_Pin */
+  GPIO_InitStruct.Pin = USB_PowerSwitchOn_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  HAL_GPIO_Init(USB_PowerSwitchOn_GPIO_Port, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : USB_OverCurrent_Pin */
+  GPIO_InitStruct.Pin = USB_OverCurrent_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(USB_OverCurrent_GPIO_Port, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : PC7 */
+  GPIO_InitStruct.Pin = GPIO_PIN_7;
+  GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  HAL_GPIO_Init(GPIOC, &GPIO_InitStruct);
+
+/* USER CODE BEGIN MX_GPIO_Init_2 */
+/* USER CODE END MX_GPIO_Init_2 */
+}
+
+/* USER CODE BEGIN 4 */
+
+/* USER CODE END 4 */
+
+/**
+  * @brief  This function is executed in case of error occurrence.
+  * @retval None
+  */
+void Error_Handler(void)
+{
+  /* USER CODE BEGIN Error_Handler_Debug */
+  /* User can add his own implementation to report the HAL error return state */
+  __disable_irq();
+  while (1)
+  {
+  }
+  /* USER CODE END Error_Handler_Debug */
+}
+
+#ifdef  USE_FULL_ASSERT
+/**
+  * @brief  Reports the name of the source file and the source line number
+  *         where the assert_param error has occurred.
+  * @param  file: pointer to the source file name
+  * @param  line: assert_param error line source number
+  * @retval None
+  */
+void assert_failed(uint8_t *file, uint32_t line)
+{
+  /* USER CODE BEGIN 6 */
+  /* User can add his own implementation to report the file name and line number,
+     ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */
+  /* USER CODE END 6 */
+}
+#endif /* USE_FULL_ASSERT */
diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja
new file mode 100644
index 0000000..ac5b695
--- /dev/null
+++ b/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja
@@ -0,0 +1,365 @@
+/* USER CODE BEGIN Header */
+/**
+  ******************************************************************************
+  * @file           : main.c
+  * @brief          : Main program body
+  ******************************************************************************
+  * @attention
+  *
+  * <h2><center>&copy; Copyright (c) 2019 STMicroelectronics.
+  * All rights reserved.</center></h2>
+  *
+  * This software component is licensed by ST under BSD 3-Clause license,
+  * the "License"; You may not use this file except in compliance with the
+  * License. You may obtain a copy of the License at:
+  *                        opensource.org/licenses/BSD-3-Clause
+  *
+  ******************************************************************************
+  */
+/* USER CODE END Header */
+
+/* Includes ------------------------------------------------------------------*/
+#include "main.h"
+
+/* Private includes ----------------------------------------------------------*/
+/* USER CODE BEGIN Includes */
+
+#include <stdio.h>
+
+#include "dnn/include/forward.hpp"
+
+/* USER CODE END Includes */
+
+/* Private typedef -----------------------------------------------------------*/
+/* USER CODE BEGIN PTD */
+
+/* USER CODE END PTD */
+
+/* Private define ------------------------------------------------------------*/
+/* USER CODE BEGIN PD */
+
+
+/* USER CODE END PD */
+
+/* Private macro -------------------------------------------------------------*/
+/* USER CODE BEGIN PM */
+
+/* USER CODE END PM */
+
+/* Private variables ---------------------------------------------------------*/
+
+UART_HandleTypeDef huart3;
+
+/* USER CODE BEGIN PV */
+
+/* USER CODE END PV */
+
+/* Private function prototypes -----------------------------------------------*/
+void SystemClock_Config(void);
+static void MX_GPIO_Init(void);
+static void MX_USART3_UART_Init(void);
+/* USER CODE BEGIN PFP */
+
+/* USER CODE END PFP */
+
+/* Private user code ---------------------------------------------------------*/
+/* USER CODE BEGIN 0 */
+
+int _write(int file, char *data, int len)
+{
+   HAL_StatusTypeDef status = HAL_UART_Transmit(&huart3, (uint8_t*) data, len, 1000);
+
+   return (status == HAL_OK ? len : 0);
+}
+
+/* USER CODE END 0 */
+
+/**
+  * @brief  The application entry point.
+  * @retval int
+  */
+#include "{{ call_function }}.hpp"
+int main(void)
+{
+  /* USER CODE BEGIN 1 */
+
+  /* USER CODE END 1 */
+
+
+  /* Enable I-Cache---------------------------------------------------------*/
+  SCB_EnableICache();
+
+  /* Enable D-Cache---------------------------------------------------------*/
+  SCB_EnableDCache();
+
+  /* MCU Configuration--------------------------------------------------------*/
+
+  /* Reset of all peripherals, Initializes the Flash interface and the Systick. */
+  HAL_Init();
+
+  /* USER CODE BEGIN Init */
+
+  /* USER CODE END Init */
+
+  /* Configure the system clock */
+  SystemClock_Config();
+
+  /* USER CODE BEGIN SysInit */
+
+  /* USER CODE END SysInit */
+
+  /* Initialize all configured peripherals */
+  MX_GPIO_Init();
+  MX_USART3_UART_Init();
+  /* USER CODE BEGIN 2 */
+
+  printf("\r\n");
+  printf("*****************************************************\r\n");
+  printf("****************** DEMO EXPORT ARM ******************\r\n");
+  printf("*****************************************************\r\n");
+  printf("\r\n");
+  {{ call_function }}();
+  printf("\r\n");
+  printf("*****************************************************\r\n");
+  printf("********************** END DEMO *********************\r\n");
+  printf("*****************************************************\r\n");
+  printf("\r\n");
+
+  /* USER CODE END 3 */
+}
+
+/**
+  * @brief System Clock Configuration
+  * @retval None
+  */
+void SystemClock_Config(void)
+{
+  RCC_OscInitTypeDef RCC_OscInitStruct = {0};
+  RCC_ClkInitTypeDef RCC_ClkInitStruct = {0};
+  RCC_PeriphCLKInitTypeDef PeriphClkInitStruct = {0};
+
+  /** Supply configuration update enable
+  */
+  HAL_PWREx_ConfigSupply(PWR_LDO_SUPPLY);
+  /** Configure the main internal regulator output voltage
+  */
+  __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE1);
+
+  while(!__HAL_PWR_GET_FLAG(PWR_FLAG_VOSRDY)) {}
+  /** Initializes the CPU, AHB and APB busses clocks
+  */
+  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSI;
+  RCC_OscInitStruct.HSIState = RCC_HSI_DIV1;
+  RCC_OscInitStruct.HSICalibrationValue = RCC_HSICALIBRATION_DEFAULT;
+  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
+  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSI;
+  RCC_OscInitStruct.PLL.PLLM = 4;
+  RCC_OscInitStruct.PLL.PLLN = 50;
+  RCC_OscInitStruct.PLL.PLLP = 2;
+  RCC_OscInitStruct.PLL.PLLQ = 2;
+  RCC_OscInitStruct.PLL.PLLR = 2;
+  RCC_OscInitStruct.PLL.PLLRGE = RCC_PLL1VCIRANGE_3;
+  RCC_OscInitStruct.PLL.PLLVCOSEL = RCC_PLL1VCOWIDE;
+  RCC_OscInitStruct.PLL.PLLFRACN = 0;
+  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  /** Initializes the CPU, AHB and APB busses clocks
+  */
+  RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK
+                              |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2
+                              |RCC_CLOCKTYPE_D3PCLK1|RCC_CLOCKTYPE_D1PCLK1;
+  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;
+  RCC_ClkInitStruct.SYSCLKDivider = RCC_SYSCLK_DIV1;
+  RCC_ClkInitStruct.AHBCLKDivider = RCC_HCLK_DIV2;
+  RCC_ClkInitStruct.APB3CLKDivider = RCC_APB3_DIV2;
+  RCC_ClkInitStruct.APB1CLKDivider = RCC_APB1_DIV2;
+  RCC_ClkInitStruct.APB2CLKDivider = RCC_APB2_DIV2;
+  RCC_ClkInitStruct.APB4CLKDivider = RCC_APB4_DIV2;
+
+  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_2) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  PeriphClkInitStruct.PeriphClockSelection = RCC_PERIPHCLK_USART3;
+  PeriphClkInitStruct.Usart234578ClockSelection = RCC_USART234578CLKSOURCE_D2PCLK1;
+  if (HAL_RCCEx_PeriphCLKConfig(&PeriphClkInitStruct) != HAL_OK)
+  {
+    Error_Handler();
+  }
+}
+
+/**
+  * @brief USART3 Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_USART3_UART_Init(void)
+{
+
+  /* USER CODE BEGIN USART3_Init 0 */
+
+  /* USER CODE END USART3_Init 0 */
+
+  /* USER CODE BEGIN USART3_Init 1 */
+
+  /* USER CODE END USART3_Init 1 */
+  huart3.Instance = USART3;
+  huart3.Init.BaudRate = 115200;
+  huart3.Init.WordLength = UART_WORDLENGTH_8B;
+  huart3.Init.StopBits = UART_STOPBITS_1;
+  huart3.Init.Parity = UART_PARITY_NONE;
+  huart3.Init.Mode = UART_MODE_TX_RX;
+  huart3.Init.HwFlowCtl = UART_HWCONTROL_NONE;
+  huart3.Init.OverSampling = UART_OVERSAMPLING_16;
+  huart3.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE;
+  huart3.Init.ClockPrescaler = UART_PRESCALER_DIV1;
+  huart3.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT;
+  if (HAL_UART_Init(&huart3) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  if (HAL_UARTEx_SetTxFifoThreshold(&huart3, UART_TXFIFO_THRESHOLD_1_8) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  if (HAL_UARTEx_SetRxFifoThreshold(&huart3, UART_RXFIFO_THRESHOLD_1_8) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  if (HAL_UARTEx_DisableFifoMode(&huart3) != HAL_OK)
+  {
+    Error_Handler();
+  }
+  /* USER CODE BEGIN USART3_Init 2 */
+
+  /* USER CODE END USART3_Init 2 */
+
+}
+
+/**
+  * @brief GPIO Initialization Function
+  * @param None
+  * @retval None
+  */
+static void MX_GPIO_Init(void)
+{
+  GPIO_InitTypeDef GPIO_InitStruct = {0};
+
+  /* GPIO Ports Clock Enable */
+  __HAL_RCC_GPIOC_CLK_ENABLE();
+  __HAL_RCC_GPIOH_CLK_ENABLE();
+  __HAL_RCC_GPIOA_CLK_ENABLE();
+  __HAL_RCC_GPIOB_CLK_ENABLE();
+  __HAL_RCC_GPIOD_CLK_ENABLE();
+  __HAL_RCC_GPIOG_CLK_ENABLE();
+
+  /*Configure GPIO pin Output Level */
+  HAL_GPIO_WritePin(GPIOB, GPIO_PIN_14|GPIO_PIN_7, GPIO_PIN_RESET);
+
+  /*Configure GPIO pin Output Level */
+  HAL_GPIO_WritePin(GPIOG, GPIO_PIN_6, GPIO_PIN_RESET);
+
+  /*Configure GPIO pin : PC13 */
+  GPIO_InitStruct.Pin = GPIO_PIN_13;
+  GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(GPIOC, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : PC1 PC4 PC5 */
+  GPIO_InitStruct.Pin = GPIO_PIN_1|GPIO_PIN_4|GPIO_PIN_5;
+  GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  GPIO_InitStruct.Alternate = GPIO_AF11_ETH;
+  HAL_GPIO_Init(GPIOC, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : PA1 PA2 PA7 */
+  GPIO_InitStruct.Pin = GPIO_PIN_1|GPIO_PIN_2|GPIO_PIN_7;
+  GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  GPIO_InitStruct.Alternate = GPIO_AF11_ETH;
+  HAL_GPIO_Init(GPIOA, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : PB13 */
+  GPIO_InitStruct.Pin = GPIO_PIN_13;
+  GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  GPIO_InitStruct.Alternate = GPIO_AF11_ETH;
+  HAL_GPIO_Init(GPIOB, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : PB14 PB7 */
+  GPIO_InitStruct.Pin = GPIO_PIN_14|GPIO_PIN_7;
+  GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  HAL_GPIO_Init(GPIOB, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : PG6 */
+  GPIO_InitStruct.Pin = GPIO_PIN_6;
+  GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  HAL_GPIO_Init(GPIOG, &GPIO_InitStruct);
+
+  /*Configure GPIO pin : PG7 */
+  GPIO_InitStruct.Pin = GPIO_PIN_7;
+  GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(GPIOG, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : PA8 PA10 PA11 PA12 */
+  GPIO_InitStruct.Pin = GPIO_PIN_8|GPIO_PIN_10|GPIO_PIN_11|GPIO_PIN_12;
+  GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  GPIO_InitStruct.Alternate = GPIO_AF10_OTG1_FS;
+  HAL_GPIO_Init(GPIOA, &GPIO_InitStruct);
+
+  /*Configure GPIO pins : PG11 PG13 */
+  GPIO_InitStruct.Pin = GPIO_PIN_11|GPIO_PIN_13;
+  GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
+  GPIO_InitStruct.Alternate = GPIO_AF11_ETH;
+  HAL_GPIO_Init(GPIOG, &GPIO_InitStruct);
+
+}
+
+/* USER CODE BEGIN 4 */
+
+/* USER CODE END 4 */
+
+/**
+  * @brief  This function is executed in case of error occurrence.
+  * @retval None
+  */
+void Error_Handler(void)
+{
+  /* USER CODE BEGIN Error_Handler_Debug */
+  /* User can add his own implementation to report the HAL error return state */
+
+  /* USER CODE END Error_Handler_Debug */
+}
+
+#ifdef  USE_FULL_ASSERT
+/**
+  * @brief  Reports the name of the source file and the source line number
+  *         where the assert_param error has occurred.
+  * @param  file: pointer to the source file name
+  * @param  line: assert_param error line source number
+  * @retval None
+  */
+void assert_failed(uint8_t *file, uint32_t line)
+{
+  /* USER CODE BEGIN 6 */
+  /* User can add his own implementation to report the file name and line number,
+     ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */
+  /* USER CODE END 6 */
+}
+#endif /* USE_FULL_ASSERT */
+
+/************************ (C) COPYRIGHT STMicroelectronics *****END OF FILE****/
diff --git a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja
new file mode 100644
index 0000000..5f250b6
--- /dev/null
+++ b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja
@@ -0,0 +1,44 @@
+#ifndef MAIN_CALL_HPP
+#define MAIN_CALL_HPP
+
+#include "forward.hpp"
+#include <cstdio> 
+{% for name in inputs_name %}
+#include "{{ name }}.h"
+{% endfor %}
+
+{% set printf_formats = {
+    "double": "%lf",
+    "float": "%f",
+    "int8_t": "%hhd",
+    "int16_t": "%hd",
+    "int32_t": "%d",
+    "int64_t": "%lld",
+    "uint8_t": "%hhu",
+    "uint16_t": "%hu",
+    "uint32_t": "%u",
+    "uint64_t": "%llu"
+} %}
+
+// Function declaration for main_call
+int print_output(
+    // Initialize the output arrays
+    {%- for o in range(outputs_name | length) %}
+    {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = nullptr;
+    {% endfor %}
+
+    // Call the forward function
+    {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+
+    // Print the results of each output
+    {%- for o in range(outputs_name | length) %}
+    printf("{{ outputs_name[o] }}:\n\r");
+    for (int o = 0; o < {{ outputs_size[o] }}; ++o) {
+        printf("{{ printf_formats[outputs_dtype[o]] }} ", {{ outputs_name[o] }}[o]);
+    }
+    printf("\n\r");
+    {% endfor %}
+    return 0;
+);
+
+#endif // MAIN_CALL_HPP
\ No newline at end of file
diff --git a/ba.py b/ba.py
new file mode 100644
index 0000000..4930900
--- /dev/null
+++ b/ba.py
@@ -0,0 +1,102 @@
+
+import aidge_core
+import aidge_backend_cpu
+import aidge_export_arm_cortexm
+import aidge_export_cpp
+import aidge_onnx 
+# import aidge_quantization
+import numpy as np
+from aidge_export_arm_cortexm.export_registry import ExportLibAidgeARM, ExportLibCMSISNN
+from aidge_export_cpp.export_registry import ExportLibCpp
+
+SEED = 123
+np.random.seed(SEED)
+# PARAMETERS
+INPUT_DIMS = [1, 3, 5, 5]
+KERNEL_DIMS = [1, 1]
+STRIDE_DIMS = [1, 1]
+DILATION_DIMS = [1, 1]
+IN_CHANNELS = 2
+OUT_CHANNELS = 2
+NO_BIAS = False
+
+
+# EXECUTION
+
+PROPAGATE = True
+FUSE = True
+ADAPT_TO_BACKEND = True
+CONSTANT_FOLDING = False
+GENERATE_SCHEDULING = True
+EXPORT = True
+PRINT_INPUT_OUTPUT_INFOS = True
+NAME = "test_cmsis_nn"
+LIB = "CMSIS-NN"
+INIT_WEIGHTS = True
+
+model = aidge_core.sequential([
+    # aidge_core.Producer(aidge_core.Tensor(dims=INPUT_DIMS), name="dataProvider"),
+    aidge_core.Conv2D(in_channels=3, out_channels=OUT_CHANNELS, kernel_dims=KERNEL_DIMS, name='conv', stride_dims=STRIDE_DIMS, dilation_dims=DILATION_DIMS, no_bias=NO_BIAS),
+])
+
+model.save("init")
+
+input_array = np.random.random( size=INPUT_DIMS).astype(np.float32)
+#Init des poids 
+if INIT_WEIGHTS:
+    for n in model.get_nodes() :
+        print( "Node : " + str(n))
+        if n.type() == "Producer":
+            dims = n.get_operator().get_output(0).dims()
+            print(dims)
+            array = np.random.random( size=dims).astype(np.float32)
+            tensor = aidge_core.Tensor(array).set_datatype(aidge_core.dtype.float32)
+            n.get_operator().set_output(0, aidge_core.Tensor(array))
+            print(str(n.get_operator().get_output(0).dformat()))
+            print("Data in :")
+            print(array)
+            
+
+model.compile("cpu", aidge_core.dtype.float32, dims=[INPUT_DIMS])
+
+
+
+def propagate(model, scheduler, tensor):
+    # Setup the input
+    
+    input_tensor = aidge_core.Tensor(tensor)
+    # Tensor backend must be set again ...
+    input_tensor.set_backend("cpu")
+    input_tensor.set_datatype(aidge_core.dtype.float32)
+    # Run the inference 
+    scheduler.forward(True, [input_tensor])    
+    # Gather the results
+    output_node = model.get_output_nodes().pop()
+    output_tensor = output_node.get_operator().get_output(0)
+    aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="expected_output", tensor=output_tensor)
+    return np.array(output_tensor)
+
+
+if PROPAGATE:
+    model.set_datatype(aidge_core.dtype.float32)
+    model.set_backend("cpu")
+    scheduler = aidge_core.SequentialScheduler(model)
+    output_array =propagate(model, scheduler, input_array)
+    print(output_array)
+
+ 
+scheduler = aidge_core.SequentialScheduler(model)
+scheduler.generate_scheduling()
+scheduler.graph_view()
+
+
+aidge_export_arm_cortexm.export(
+        "conv_export",
+        graphview=model,
+        scheduler = scheduler,
+        board="stm32h7"
+        
+)
+aidge_export_arm_cortexm.utils.generate_call_function_arm_cortex_m("conv_export","main","stm32h7")
+aidge_export_arm_cortexm.utils.generate_print_output_arm_cortex_m("conv_export",model,"stm32h7",input_array)
+# aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="_input_0", tensor=aidge_core.Tensor(input_array))
-- 
GitLab


From 0ea91be627ae10cc7f7a6d899361a529fe410f5d Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Fri, 11 Apr 2025 14:55:51 +0200
Subject: [PATCH 2/6] Refactor print_output function and update imports in
 utils.py

---
 .../templates/main_call/print_output.jinja    |  8 +--
 aidge_export_arm_cortexm/utils.py             | 56 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja
index 5f250b6..5ee79cd 100644
--- a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja
+++ b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja
@@ -2,7 +2,7 @@
 #define MAIN_CALL_HPP
 
 #include "forward.hpp"
-#include <cstdio> 
+#include "stdio.h" 
 {% for name in inputs_name %}
 #include "{{ name }}.h"
 {% endfor %}
@@ -21,10 +21,10 @@
 } %}
 
 // Function declaration for main_call
-int print_output(
+int print_output(){
     // Initialize the output arrays
     {%- for o in range(outputs_name | length) %}
-    {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = nullptr;
+    {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = NULL;
     {% endfor %}
 
     // Call the forward function
@@ -39,6 +39,6 @@ int print_output(
     printf("\n\r");
     {% endfor %}
     return 0;
-);
+};
 
 #endif // MAIN_CALL_HPP
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/utils.py b/aidge_export_arm_cortexm/utils.py
index e5b166d..cf8cf74 100644
--- a/aidge_export_arm_cortexm/utils.py
+++ b/aidge_export_arm_cortexm/utils.py
@@ -1,5 +1,8 @@
 from importlib.metadata import version
-
+from pathlib import Path
+from aidge_core.export_utils import generate_file, data_conversion
+import aidge_core
+from aidge_export_arm_cortexm import ROOT
 
 def show_version():
     version_aidge_export_arm_cortexm = version("aidge_export_arm_cortexm")
@@ -7,3 +10,54 @@ def show_version():
 
 def get_project_version()->str:
     return version("aidge_export_arm_cortexm")
+
+def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None:
+    generate_file(
+        str(Path(export_folder) / "Src" / "main.c"),
+        str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")),
+        call_function=call_function
+    )
+
+def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None:
+
+    outputs_name: list[str] = []
+    outputs_dtype: list[str] = []
+    outputs_size: list[int] = []
+    inputs_name: list[str] = []
+    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
+    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
+
+    for in_node, in_idx in gv_inputs:
+        in_node_input, in_node_input_idx = in_node.input(in_idx)
+        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
+        inputs_name.append(in_name)
+        input_tensor = in_node.get_operator().get_input(in_idx)
+        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
+            if inputs_tensor is not None:
+                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
+            else:
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
+        else:
+            aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor)
+
+    for out_node, out_id in gv_outputs:
+        outputs_name.append(f"{out_node.name()}_output_{out_id}")
+        out_tensor = out_node.get_operator().get_output(out_id)
+        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
+        outputs_size.append(out_tensor.size())
+
+    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
+            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
+
+    ROOT = Path(__file__).resolve().parents[0]
+    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
+    generate_file(
+        str(Path(export_folder)/"Src" / "print_output.hpp"),
+        str(ROOT / "templates" / "main_call" / "print_output.jinja"),
+        func_name="model_forward",
+        inputs_name=inputs_name,
+        outputs_name=outputs_name,
+        outputs_dtype=outputs_dtype,
+        outputs_size=outputs_size
+    )
\ No newline at end of file
-- 
GitLab


From ca3258a87d1e8cc00efbb6f3e0fa14f281a3cdc5 Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Fri, 11 Apr 2025 14:58:41 +0200
Subject: [PATCH 3/6] Remove ba.py; main_stm32F7.jinja still needs fixing (not
 working yet), and move methods to benchmark.py

---
 ba.py | 102 ----------------------------------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100644 ba.py

diff --git a/ba.py b/ba.py
deleted file mode 100644
index 4930900..0000000
--- a/ba.py
+++ /dev/null
@@ -1,102 +0,0 @@
-
-import aidge_core
-import aidge_backend_cpu
-import aidge_export_arm_cortexm
-import aidge_export_cpp
-import aidge_onnx 
-# import aidge_quantization
-import numpy as np
-from aidge_export_arm_cortexm.export_registry import ExportLibAidgeARM, ExportLibCMSISNN
-from aidge_export_cpp.export_registry import ExportLibCpp
-
-SEED = 123
-np.random.seed(SEED)
-# PARAMETERS
-INPUT_DIMS = [1, 3, 5, 5]
-KERNEL_DIMS = [1, 1]
-STRIDE_DIMS = [1, 1]
-DILATION_DIMS = [1, 1]
-IN_CHANNELS = 2
-OUT_CHANNELS = 2
-NO_BIAS = False
-
-
-# EXECUTION
-
-PROPAGATE = True
-FUSE = True
-ADAPT_TO_BACKEND = True
-CONSTANT_FOLDING = False
-GENERATE_SCHEDULING = True
-EXPORT = True
-PRINT_INPUT_OUTPUT_INFOS = True
-NAME = "test_cmsis_nn"
-LIB = "CMSIS-NN"
-INIT_WEIGHTS = True
-
-model = aidge_core.sequential([
-    # aidge_core.Producer(aidge_core.Tensor(dims=INPUT_DIMS), name="dataProvider"),
-    aidge_core.Conv2D(in_channels=3, out_channels=OUT_CHANNELS, kernel_dims=KERNEL_DIMS, name='conv', stride_dims=STRIDE_DIMS, dilation_dims=DILATION_DIMS, no_bias=NO_BIAS),
-])
-
-model.save("init")
-
-input_array = np.random.random( size=INPUT_DIMS).astype(np.float32)
-#Init des poids 
-if INIT_WEIGHTS:
-    for n in model.get_nodes() :
-        print( "Node : " + str(n))
-        if n.type() == "Producer":
-            dims = n.get_operator().get_output(0).dims()
-            print(dims)
-            array = np.random.random( size=dims).astype(np.float32)
-            tensor = aidge_core.Tensor(array).set_datatype(aidge_core.dtype.float32)
-            n.get_operator().set_output(0, aidge_core.Tensor(array))
-            print(str(n.get_operator().get_output(0).dformat()))
-            print("Data in :")
-            print(array)
-            
-
-model.compile("cpu", aidge_core.dtype.float32, dims=[INPUT_DIMS])
-
-
-
-def propagate(model, scheduler, tensor):
-    # Setup the input
-    
-    input_tensor = aidge_core.Tensor(tensor)
-    # Tensor backend must be set again ...
-    input_tensor.set_backend("cpu")
-    input_tensor.set_datatype(aidge_core.dtype.float32)
-    # Run the inference 
-    scheduler.forward(True, [input_tensor])    
-    # Gather the results
-    output_node = model.get_output_nodes().pop()
-    output_tensor = output_node.get_operator().get_output(0)
-    aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="expected_output", tensor=output_tensor)
-    return np.array(output_tensor)
-
-
-if PROPAGATE:
-    model.set_datatype(aidge_core.dtype.float32)
-    model.set_backend("cpu")
-    scheduler = aidge_core.SequentialScheduler(model)
-    output_array =propagate(model, scheduler, input_array)
-    print(output_array)
-
- 
-scheduler = aidge_core.SequentialScheduler(model)
-scheduler.generate_scheduling()
-scheduler.graph_view()
-
-
-aidge_export_arm_cortexm.export(
-        "conv_export",
-        graphview=model,
-        scheduler = scheduler,
-        board="stm32h7"
-        
-)
-aidge_export_arm_cortexm.utils.generate_call_function_arm_cortex_m("conv_export","main","stm32h7")
-aidge_export_arm_cortexm.utils.generate_print_output_arm_cortex_m("conv_export",model,"stm32h7",input_array)
-# aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="_input_0", tensor=aidge_core.Tensor(input_array))
-- 
GitLab


From e4bced38535e7820a4408be0aee99af5f57382d5 Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Fri, 11 Apr 2025 15:09:20 +0200
Subject: [PATCH 4/6] Refactor benchmark and utils: move
 generate_call_function_arm_cortex_m and generate_print_output_arm_cortex_m to
 benchmark.py

---
 aidge_export_arm_cortexm/benchmark.py | 189 ++++++++------------------
 aidge_export_arm_cortexm/utils.py     |  50 -------
 2 files changed, 55 insertions(+), 184 deletions(-)

diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py
index 6366c57..1c3004b 100644
--- a/aidge_export_arm_cortexm/benchmark.py
+++ b/aidge_export_arm_cortexm/benchmark.py
@@ -1,138 +1,59 @@
-import contextlib
-import os
-from shutil import rmtree
-from subprocess import run
-
 import numpy as np
 
 import aidge_core
-import aidge_backend_cpu
-import aidge_export_arm_cortexm
-
-def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
-    # load and set up the model
-    # model.set_datatype(ai.dtype.float32)
-    model.set_backend("cpu")
-
-    # create input Tensor list for the GraphView
-    ordered_inputs: list[aidge_core.Tensor] = []
-    # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
-    for i in input_data:
-        nb_dims = len(i[1].shape)
-        if nb_dims == 3:
-            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
-        if nb_dims == 4:
-            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
+from pathlib import Path
+from aidge_core.export_utils import generate_file, data_conversion
+from aidge_export_arm_cortexm import ROOT
+
+
+
+def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None:
+    generate_file(
+        str(Path(export_folder) / "Src" / "main.c"),
+        str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")),
+        call_function=call_function
+    )
+
+def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None:
+
+    outputs_name: list[str] = []
+    outputs_dtype: list[str] = []
+    outputs_size: list[int] = []
+    inputs_name: list[str] = []
+    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
+    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
+
+    for in_node, in_idx in gv_inputs:
+        in_node_input, in_node_input_idx = in_node.input(in_idx)
+        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
+        inputs_name.append(in_name)
+        input_tensor = in_node.get_operator().get_input(in_idx)
+        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
+            if inputs_tensor is not None:
+                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
+            else:
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
         else:
-            ordered_inputs.append(aidge_core.Tensor(i[1]))
-
-    # set inputs for the export
-    for i, inp in enumerate(model.get_ordered_inputs()):
-        op = inp[0].get_operator()
-        op.set_input(i, ordered_inputs[i])
-
-    model.forward_dims([t.dims() for t in ordered_inputs])
-
-    scheduler = aidge_core.SequentialScheduler(model)
-    scheduler.generate_scheduling()
-
-    # for ordered_input in ordered_inputs:
-        # ordered_input.set_backend("cpu")
-    operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
-    print("  ├─Generating export...", end="", flush=True)
-    folder_name: str = f"{operator_type.lower()}_test_export_cpp"
-    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-        aidge_core.export_utils.scheduler_export(
-            scheduler,
-            folder_name,
-            aidge_export_arm_cortexm.ExportLibAidgeARM,
-            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
-            memory_manager_args={"wrapping": False }
-        )
-        aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
-    print(" ok")
-
-    print("  ├─Compiling...", end="", flush=True)
-    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-        run(['make'], cwd=folder_name, stdout=f)
-    print(" ok")
-    timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
-
-    folder_path = os.path.abspath(folder_name)
-    if os.path.exists(folder_path):
-        rmtree(folder_path, ignore_errors=True)
-
-    timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
-    return timings
-
-def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
-    # load and set up the model
-    model.set_backend("cpu")
-
-    # create input Tensor list for the GraphView
-    ordered_inputs: list[aidge_core.Tensor] = []
-    # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
-    for i in input_data:
-        nb_dims = len(i[1].shape)
-        if nb_dims == 3:
-            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
-        if nb_dims == 4:
-            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
-        else:
-            ordered_inputs.append(aidge_core.Tensor(i[1]))
-
-    # set inputs for the export
-    for i, inp in enumerate(model.get_ordered_inputs()):
-        op = inp[0].get_operator()
-        op.set_input(i, ordered_inputs[i])
-
-    model.forward_dims([t.dims() for t in ordered_inputs])
-
-    scheduler = aidge_core.SequentialScheduler(model)
-    scheduler.generate_scheduling()
-
-
-    operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
-    print("  │ Generating export...", end="", flush=True)
-    folder_name: str = f"{operator_type.lower()}_test_export_cpp"
-    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-        aidge_core.export_utils.scheduler_export(
-            scheduler,
-            folder_name,
-            aidge_export_cpp.ExportLibCpp,
-            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
-            memory_manager_args={"wrapping": False }
-        )
-        aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
-    print(" ok")
-
-    print("  │ Compiling...", end="", flush=True)
-    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-        run(['make'], cwd=folder_name, stdout=f)
-    print(" ok")
-    output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
-    folder_path = os.path.abspath(folder_name)
-    if os.path.exists(folder_path):
-        rmtree(folder_path, ignore_errors=True)
-
-    outputs_str: list[str] = output_str.stdout.strip().split('\n')
-    outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
-
-    for i, pair in enumerate(model.get_ordered_outputs()):
-        dims = pair[0].get_operator().get_output(pair[1]).dims()
-        nb_dims = len(dims)
-        dims_permutted = dims
-        if nb_dims == 3:
-            dims_permutted = [dims[0], dims[2], dims[1]]
-        if nb_dims == 4:
-            dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
-
-        if np.prod(dims) != outputs[i].size:
-            aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
-        outputs[i] = outputs[i].reshape(dims_permutted)
-        if nb_dims == 3:
-            outputs[i] = outputs[i].transpose(0,2,1)
-        if nb_dims == 4:
-            outputs[i] = outputs[i].transpose(0,3,1,2)
-
-    return outputs
\ No newline at end of file
+            aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor)
+
+    for out_node, out_id in gv_outputs:
+        outputs_name.append(f"{out_node.name()}_output_{out_id}")
+        out_tensor = out_node.get_operator().get_output(out_id)
+        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
+        outputs_size.append(out_tensor.size())
+
+    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
+            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
+
+    ROOT = Path(__file__).resolve().parents[0]
+    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
+    generate_file(
+        str(Path(export_folder)/"Src" / "print_output.hpp"),
+        str(ROOT / "templates" / "main_call" / "print_output.jinja"),
+        func_name="model_forward",
+        inputs_name=inputs_name,
+        outputs_name=outputs_name,
+        outputs_dtype=outputs_dtype,
+        outputs_size=outputs_size
+    )
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/utils.py b/aidge_export_arm_cortexm/utils.py
index cf8cf74..295c2fe 100644
--- a/aidge_export_arm_cortexm/utils.py
+++ b/aidge_export_arm_cortexm/utils.py
@@ -11,53 +11,3 @@ def show_version():
 def get_project_version()->str:
     return version("aidge_export_arm_cortexm")
 
-def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None:
-    generate_file(
-        str(Path(export_folder) / "Src" / "main.c"),
-        str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")),
-        call_function=call_function
-    )
-
-def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None:
-
-    outputs_name: list[str] = []
-    outputs_dtype: list[str] = []
-    outputs_size: list[int] = []
-    inputs_name: list[str] = []
-    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
-    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
-
-    for in_node, in_idx in gv_inputs:
-        in_node_input, in_node_input_idx = in_node.input(in_idx)
-        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
-        inputs_name.append(in_name)
-        input_tensor = in_node.get_operator().get_input(in_idx)
-        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
-            if inputs_tensor is not None:
-                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
-                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
-            else:
-                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
-        else:
-            aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor)
-
-    for out_node, out_id in gv_outputs:
-        outputs_name.append(f"{out_node.name()}_output_{out_id}")
-        out_tensor = out_node.get_operator().get_output(out_id)
-        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
-        outputs_size.append(out_tensor.size())
-
-    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
-            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
-
-    ROOT = Path(__file__).resolve().parents[0]
-    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
-    generate_file(
-        str(Path(export_folder)/"Src" / "print_output.hpp"),
-        str(ROOT / "templates" / "main_call" / "print_output.jinja"),
-        func_name="model_forward",
-        inputs_name=inputs_name,
-        outputs_name=outputs_name,
-        outputs_dtype=outputs_dtype,
-        outputs_size=outputs_size
-    )
\ No newline at end of file
-- 
GitLab


From 2c9ef702373900770b3f6b8e72fb7e21d5efe634 Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Tue, 22 Apr 2025 14:25:31 +0200
Subject: [PATCH 5/6] Refactor benchmark and generation utilities: move
 functions to generate.py and add benchmark_inference_time template

---
 aidge_export_arm_cortexm/benchmark.py         | 188 +++++++++++++-----
 aidge_export_arm_cortexm/generate.py          | 103 ++++++++++
 .../main_call/benchmark_inference_time.jinja  |  56 ++++++
 .../templates/main_call/main_stm32f7.jinja    |   2 +
 4 files changed, 299 insertions(+), 50 deletions(-)
 create mode 100644 aidge_export_arm_cortexm/generate.py
 create mode 100644 aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja

diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py
index 1c3004b..e842dde 100644
--- a/aidge_export_arm_cortexm/benchmark.py
+++ b/aidge_export_arm_cortexm/benchmark.py
@@ -2,58 +2,146 @@ import numpy as np
 
 import aidge_core
 from pathlib import Path
-from aidge_core.export_utils import generate_file, data_conversion
+import aidge_export_arm_cortexm.generate 
 from aidge_export_arm_cortexm import ROOT
 
+import contextlib
+import os
+from shutil import rmtree
+from subprocess import run
 
+import numpy as np
+
+import aidge_core
+import aidge_backend_cpu
+import aidge_export_cpp
 
-def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None:
-    generate_file(
-        str(Path(export_folder) / "Src" / "main.c"),
-        str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")),
-        call_function=call_function
-    )
-
-def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None:
-
-    outputs_name: list[str] = []
-    outputs_dtype: list[str] = []
-    outputs_size: list[int] = []
-    inputs_name: list[str] = []
-    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
-    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
-
-    for in_node, in_idx in gv_inputs:
-        in_node_input, in_node_input_idx = in_node.input(in_idx)
-        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
-        inputs_name.append(in_name)
-        input_tensor = in_node.get_operator().get_input(in_idx)
-        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
-            if inputs_tensor is not None:
-                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
-                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
-            else:
-                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.")
+def measure_inference_time_lib_aidge(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
+    # load and set up the model
+    # model.set_datatype(ai.dtype.float32)
+    model.set_backend("cpu")
+
+    # create input Tensor list for the GraphView
+    ordered_inputs: list[aidge_core.Tensor] = []
+    # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
+    for i in input_data:
+        nb_dims = len(i[1].shape)
+        if nb_dims == 3:
+            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
+        if nb_dims == 4:
+            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
         else:
-            aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor)
-
-    for out_node, out_id in gv_outputs:
-        outputs_name.append(f"{out_node.name()}_output_{out_id}")
-        out_tensor = out_node.get_operator().get_output(out_id)
-        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
-        outputs_size.append(out_tensor.size())
-
-    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
-            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
-
-    ROOT = Path(__file__).resolve().parents[0]
-    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
-    generate_file(
-        str(Path(export_folder)/"Src" / "print_output.hpp"),
-        str(ROOT / "templates" / "main_call" / "print_output.jinja"),
-        func_name="model_forward",
-        inputs_name=inputs_name,
-        outputs_name=outputs_name,
-        outputs_dtype=outputs_dtype,
-        outputs_size=outputs_size
-    )
\ No newline at end of file
+            ordered_inputs.append(aidge_core.Tensor(i[1]))
+
+    # set inputs for the export
+    for i, inp in enumerate(model.get_ordered_inputs()):
+        op = inp[0].get_operator()
+        op.set_input(i, ordered_inputs[i])
+
+    model.forward_dims([t.dims() for t in ordered_inputs])
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    # for ordered_input in ordered_inputs:
+        # ordered_input.set_backend("cpu")
+    operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
+    print("  ├─Generating export...", end="", flush=True)
+    folder_name: str = f"{operator_type.lower()}_test_export_arm_cortexm"
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        aidge_core.export_utils.scheduler_export(
+            scheduler,
+            folder_name,
+            aidge_export_arm_cortexm.ExportLibAidgeARM,
+            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
+            memory_manager_args={"wrapping": False }
+        )
+        aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
+        aidge_export_arm_cortexm.generate.generate_print_output_arm_cortex_m(folder_name, model, nb_iterations, nb_warmup)
+        
+    print(" ok")
+
+    print("  ├─Compiling...", end="", flush=True)
+    with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+        run(['make'], cwd=folder_name, stdout=f)
+    print(" ok")
+    timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
+
+    folder_path = os.path.abspath(folder_name)
+    if os.path.exists(folder_path):
+        rmtree(folder_path, ignore_errors=True)
+
+    timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
+    return timings
+
+# def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
+#     # load and set up the model
+#     model.set_backend("cpu")
+
+#     # create input Tensor list for the GraphView
+#     ordered_inputs: list[aidge_core.Tensor] = []
+#     # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
+#     for i in input_data:
+#         nb_dims = len(i[1].shape)
+#         if nb_dims == 3:
+#             ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
+#         if nb_dims == 4:
+#             ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
+#         else:
+#             ordered_inputs.append(aidge_core.Tensor(i[1]))
+
+#     # set inputs for the export
+#     for i, inp in enumerate(model.get_ordered_inputs()):
+#         op = inp[0].get_operator()
+#         op.set_input(i, ordered_inputs[i])
+
+#     model.forward_dims([t.dims() for t in ordered_inputs])
+
+#     scheduler = aidge_core.SequentialScheduler(model)
+#     scheduler.generate_scheduling()
+
+
+#     operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
+#     print("  │ Generating export...", end="", flush=True)
+#     folder_name: str = f"{operator_type.lower()}_test_export_cpp"
+#     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+#         aidge_core.export_utils.scheduler_export(
+#             scheduler,
+#             folder_name,
+#             aidge_export_cpp.ExportLibCpp,
+#             memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
+#             memory_manager_args={"wrapping": False }
+#         )
+#         aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
+#     print(" ok")
+
+#     print("  │ Compiling...", end="", flush=True)
+#     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
+#         run(['make'], cwd=folder_name, stdout=f)
+#     print(" ok")
+#     output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
+#     folder_path = os.path.abspath(folder_name)
+#     if os.path.exists(folder_path):
+#         rmtree(folder_path, ignore_errors=True)
+
+#     outputs_str: list[str] = output_str.stdout.strip().split('\n')
+#     outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
+
+#     for i, pair in enumerate(model.get_ordered_outputs()):
+#         dims = pair[0].get_operator().get_output(pair[1]).dims()
+#         nb_dims = len(dims)
+#         dims_permutted = dims
+#         if nb_dims == 3:
+#             dims_permutted = [dims[0], dims[2], dims[1]]
+#         if nb_dims == 4:
+#             dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
+
+#         if np.prod(dims) != outputs[i].size:
+#             aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
+#         outputs[i] = outputs[i].reshape(dims_permutted)
+#         if nb_dims == 3:
+#             outputs[i] = outputs[i].transpose(0,2,1)
+#         if nb_dims == 4:
+#             outputs[i] = outputs[i].transpose(0,3,1,2)
+
+#     return outputs
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/generate.py b/aidge_export_arm_cortexm/generate.py
new file mode 100644
index 0000000..934ad29
--- /dev/null
+++ b/aidge_export_arm_cortexm/generate.py
@@ -0,0 +1,103 @@
+from pathlib import Path
+from aidge_core.export_utils import generate_file, data_conversion
+from aidge_export_arm_cortexm import ROOT
+import aidge_core
+import aidge_export_arm_cortexm
+
+def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None:
+    generate_file(
+        str(Path(export_folder) / "Src" / "main.c"),
+        str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")),
+        call_function=call_function
+    )
+
+def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None:
+
+    outputs_name: list[str] = []
+    outputs_dtype: list[str] = []
+    outputs_size: list[int] = []
+    inputs_name: list[str] = []
+    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
+    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
+
+    for in_node, in_idx in gv_inputs:
+        in_node_input, in_node_input_idx = in_node.input(in_idx)
+        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
+        inputs_name.append(in_name)
+        input_tensor = in_node.get_operator().get_input(in_idx)
+        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
+            if inputs_tensor is not None:
+                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functional after code generation.")
+            else:
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functional after code generation.")
+        else:
+            aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor)
+
+    for out_node, out_id in gv_outputs:
+        outputs_name.append(f"{out_node.name()}_output_{out_id}")
+        out_tensor = out_node.get_operator().get_output(out_id)
+        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
+        outputs_size.append(out_tensor.size())
+
+    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
+            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
+
+    ROOT = Path(__file__).resolve().parents[0]
+    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
+    generate_file(
+        str(Path(export_folder)/"Src" / "print_output.hpp"),
+        str(ROOT / "templates" / "main_call" / "print_output.jinja"),
+        func_name="model_forward",
+        inputs_name=inputs_name,
+        outputs_name=outputs_name,
+        outputs_dtype=outputs_dtype,
+        outputs_size=outputs_size
+    )
+
+
+def generate_inference_time_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, nb_iterations, nb_warmup, inputs_tensor=None) -> None:
+    outputs_name: list[str] = []
+    outputs_dtype: list[str] = []
+    outputs_size: list[int] = []
+    inputs_name: list[str] = []
+    gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs()
+    gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs()
+
+    for in_node, in_idx in gv_inputs:
+        in_node_input, in_node_input_idx = in_node.input(in_idx)
+        in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}"
+        inputs_name.append(in_name)
+        input_tensor = in_node.get_operator().get_input(in_idx)
+        if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl():
+            if inputs_tensor is not None:
+                aidge_core.Log.notice("No support for inputs_tensor argument yet.")
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functional after code generation.")
+            else:
+                aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functional after code generation.")
+        else:
+            aidge_core.export_utils.generate_input_file(str(Path(export_folder) / "data"), array_name=in_name, tensor=input_tensor)
+
+    for out_node, out_id in gv_outputs:
+        outputs_name.append(f"{out_node.name()}_output_{out_id}")
+        out_tensor = out_node.get_operator().get_output(out_id)
+        outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype()))
+        outputs_size.append(out_tensor.size())
+
+    if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size):
+            raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
+
+    ROOT = Path(__file__).resolve().parents[0]
+    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
+    generate_file(
+        str(Path(export_folder) / "benchmark_inference_time.hpp"),
+        str(ROOT / "templates" / "main_call" / "benchmark_inference_time.jinja"),
+        func_name="model_forward",
+        inputs_name=inputs_name,
+        outputs_name=outputs_name,
+        outputs_dtype=outputs_dtype,
+        outputs_size=outputs_size,
+        nb_iterations=nb_iterations,
+        nb_warmup=nb_warmup,
+        board=board
+    )
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja
new file mode 100644
index 0000000..028a530
--- /dev/null
+++ b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja
@@ -0,0 +1,56 @@
+#ifndef BENCHMARK_INFERENCE_TIME_HPP
+#define BENCHMARK_INFERENCE_TIME_HPP
+
+#include "stdio.h" 
+#include "forward.hpp"
+
+// Necessary to have HAL_GetTick()
+#include "{{ board }}xx_hal.h"
+{% for name in inputs_name %}
+#include "{{ name }}.h"
+{% endfor %}
+
+{% set printf_formats = {
+    "double": "%lf",
+    "float": "%f",
+    "int8_t": "%hhd",
+    "int16_t": "%hd",
+    "int32_t": "%d",
+    "int64_t": "%lld",
+    "uint8_t": "%hhu",
+    "uint16_t": "%hu",
+    "uint32_t": "%u",
+    "uint64_t": "%llu"
+} %}
+
+int benchmark_inference_time()
+{
+    // Initialize the output arrays
+    {%- for o in range(outputs_name | length) %}
+    {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = NULL;
+    {% endfor %}
+    uint32_t start;
+    uint32_t end;
+    double times[{{ nb_iterations }}] = {0};
+    for (std::size_t i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) {
+        if (i < {{ nb_warmup }}) {
+            {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+        } else {
+            start = HAL_GetTick();
+            {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+            {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+            {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+            {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
+            end = HAL_GetTick();
+            times[i - {{ nb_warmup }}] = ((double)(end - start)/CLOCKS_PER_SEC)/4.0;
+        }
+    }
+
+    for (std::size_t i = 0; i < {{ nb_iterations }}; ++i) {
+        printf("%.10lf ", times[i]);
+    }
+    printf("\n");
+    return 0;
+}
+
+#endif BENCHMARK_INFERENCE_TIME_HPP
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
index a5f8e5e..50dea3f 100644
--- a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
+++ b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja
@@ -26,6 +26,8 @@
 #include <stdio.h>
 #include <stdint.h>
 
+#include "dnn/include/forward.hpp"
+
 /* USER CODE END Includes */
 
 /* Private typedef -----------------------------------------------------------*/
-- 
GitLab


From 8d3507547309538d325b19e843691149c2a8b5bd Mon Sep 17 00:00:00 2001
From: Wissam Boussella <wissam.boussella@cea.fr>
Date: Tue, 22 Apr 2025 16:07:22 +0200
Subject: [PATCH 6/6] Generate inference-time benchmark for STM32: compiles,
 but not yet tested on the H7

---
 aidge_export_arm_cortexm/benchmark.py         | 72 -------------------
 aidge_export_arm_cortexm/generate.py          |  2 +-
 .../main_call/benchmark_inference_time.jinja  | 10 +--
 3 files changed, 6 insertions(+), 78 deletions(-)

diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py
index e842dde..17f5ff9 100644
--- a/aidge_export_arm_cortexm/benchmark.py
+++ b/aidge_export_arm_cortexm/benchmark.py
@@ -73,75 +73,3 @@ def measure_inference_time_lib_aidge(model: aidge_core.GraphView, input_data: li
 
     timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
     return timings
-
-# def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
-#     # load and set up the model
-#     model.set_backend("cpu")
-
-#     # create input Tensor list for the GraphView
-#     ordered_inputs: list[aidge_core.Tensor] = []
-#     # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
-#     for i in input_data:
-#         nb_dims = len(i[1].shape)
-#         if nb_dims == 3:
-#             ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
-#         if nb_dims == 4:
-#             ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
-#         else:
-#             ordered_inputs.append(aidge_core.Tensor(i[1]))
-
-#     # set inputs for the export
-#     for i, inp in enumerate(model.get_ordered_inputs()):
-#         op = inp[0].get_operator()
-#         op.set_input(i, ordered_inputs[i])
-
-#     model.forward_dims([t.dims() for t in ordered_inputs])
-
-#     scheduler = aidge_core.SequentialScheduler(model)
-#     scheduler.generate_scheduling()
-
-
-#     operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
-#     print("  │ Generating export...", end="", flush=True)
-#     folder_name: str = f"{operator_type.lower()}_test_export_cpp"
-#     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-#         aidge_core.export_utils.scheduler_export(
-#             scheduler,
-#             folder_name,
-#             aidge_export_cpp.ExportLibCpp,
-#             memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
-#             memory_manager_args={"wrapping": False }
-#         )
-#         aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
-#     print(" ok")
-
-#     print("  │ Compiling...", end="", flush=True)
-#     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
-#         run(['make'], cwd=folder_name, stdout=f)
-#     print(" ok")
-#     output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
-#     folder_path = os.path.abspath(folder_name)
-#     if os.path.exists(folder_path):
-#         rmtree(folder_path, ignore_errors=True)
-
-#     outputs_str: list[str] = output_str.stdout.strip().split('\n')
-#     outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
-
-#     for i, pair in enumerate(model.get_ordered_outputs()):
-#         dims = pair[0].get_operator().get_output(pair[1]).dims()
-#         nb_dims = len(dims)
-#         dims_permutted = dims
-#         if nb_dims == 3:
-#             dims_permutted = [dims[0], dims[2], dims[1]]
-#         if nb_dims == 4:
-#             dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
-
-#         if np.prod(dims) != outputs[i].size:
-#             aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
-#         outputs[i] = outputs[i].reshape(dims_permutted)
-#         if nb_dims == 3:
-#             outputs[i] = outputs[i].transpose(0,2,1)
-#         if nb_dims == 4:
-#             outputs[i] = outputs[i].transpose(0,3,1,2)
-
-#     return outputs
\ No newline at end of file
diff --git a/aidge_export_arm_cortexm/generate.py b/aidge_export_arm_cortexm/generate.py
index 934ad29..9a9548b 100644
--- a/aidge_export_arm_cortexm/generate.py
+++ b/aidge_export_arm_cortexm/generate.py
@@ -88,7 +88,7 @@ def generate_inference_time_arm_cortex_m(export_folder: str, graph_view: aidge_c
             raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.")
 
     ROOT = Path(__file__).resolve().parents[0]
-    generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board)
+    generate_call_function_arm_cortex_m(Path(export_folder),"benchmark_inference_time",board=board)
     generate_file(
         str(Path(export_folder) / "benchmark_inference_time.hpp"),
         str(ROOT / "templates" / "main_call" / "benchmark_inference_time.jinja"),
diff --git a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja
index 028a530..2ce9596 100644
--- a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja
+++ b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja
@@ -7,7 +7,7 @@
 // Necessary to have HAL_GetTick()
 #include "{{ board }}xx_hal.h"
 {% for name in inputs_name %}
-#include "{{ name }}.h"
+#include "data/{{ name }}.h"
 {% endfor %}
 
 {% set printf_formats = {
@@ -32,7 +32,7 @@ int benchmark_inference_time()
     uint32_t start;
     uint32_t end;
     double times[{{ nb_iterations }}] = {0};
-    for (std::size_t i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) {
+    for (unsigned int i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) {
         if (i < {{ nb_warmup }}) {
             {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
         } else {
@@ -42,15 +42,15 @@ int benchmark_inference_time()
             {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
             {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }});
             end = HAL_GetTick();
-            times[i - {{ nb_warmup }}] = ((double)(end - start)/CLOCKS_PER_SEC)/4.0;
+            times[i - {{ nb_warmup }}] = (double)(end - start);
         }
     }
 
-    for (std::size_t i = 0; i < {{ nb_iterations }}; ++i) {
+    for (unsigned int i = 0; i < {{ nb_iterations }}; ++i) {
         printf("%.10lf ", times[i]);
     }
     printf("\n");
     return 0;
 }
 
-#endif BENCHMARK_INFERENCE_TIME_HPP
\ No newline at end of file
+#endif //BENCHMARK_INFERENCE_TIME_HPP
\ No newline at end of file
-- 
GitLab