From b29fd0a7f367572fff0ba06c3ab2f4376c900458 Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Fri, 11 Apr 2025 13:51:35 +0200 Subject: [PATCH 1/6] first step for benchmark --- aidge_export_arm_cortexm/benchmark.py | 138 +++++ .../templates/main_call/main_stm32f7.jinja | 474 ++++++++++++++++++ .../templates/main_call/main_stm32h7.jinja | 365 ++++++++++++++ .../templates/main_call/print_output.jinja | 44 ++ ba.py | 102 ++++ 5 files changed, 1123 insertions(+) create mode 100644 aidge_export_arm_cortexm/benchmark.py create mode 100644 aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja create mode 100644 aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja create mode 100644 aidge_export_arm_cortexm/templates/main_call/print_output.jinja create mode 100644 ba.py diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py new file mode 100644 index 0000000..6366c57 --- /dev/null +++ b/aidge_export_arm_cortexm/benchmark.py @@ -0,0 +1,138 @@ +import contextlib +import os +from shutil import rmtree +from subprocess import run + +import numpy as np + +import aidge_core +import aidge_backend_cpu +import aidge_export_arm_cortexm + +def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]: + # load and set up the model + # model.set_datatype(ai.dtype.float32) + model.set_backend("cpu") + + # create input Tensor list for the GraphView + ordered_inputs: list[aidge_core.Tensor] = [] + # [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor + for i in input_data: + nb_dims = len(i[1].shape) + if nb_dims == 3: + ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) + elif nb_dims == 4: # elif (not if): a 3-D input must not also reach the else branch and be appended twice + ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) + else: +
ordered_inputs.append(aidge_core.Tensor(i[1])) + + # set inputs for the export + for i, inp in enumerate(model.get_ordered_inputs()): + op = inp[0].get_operator() + op.set_input(i, ordered_inputs[i]) + + model.forward_dims([t.dims() for t in ordered_inputs]) + + scheduler = aidge_core.SequentialScheduler(model) + scheduler.generate_scheduling() + + # for ordered_input in ordered_inputs: + # ordered_input.set_backend("cpu") + operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() + print(" ├─Generating export...", end="", flush=True) + folder_name: str = f"{operator_type.lower()}_test_export_cpp" + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + aidge_core.export_utils.scheduler_export( + scheduler, + folder_name, + aidge_export_arm_cortexm.ExportLibAidgeARM, + memory_manager=aidge_core.mem_info.generate_optimized_memory_info, + memory_manager_args={"wrapping": False } + ) + aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup) + print(" ok") + + print(" ├─Compiling...", end="", flush=True) + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + run(['make'], cwd=folder_name, stdout=f) + print(" ok") + timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) + + folder_path = os.path.abspath(folder_name) + if os.path.exists(folder_path): + rmtree(folder_path, ignore_errors=True) + + timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()] + return timings + +def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]: + # load and set up the model + model.set_backend("cpu") + + # create input Tensor list for the GraphView + ordered_inputs: list[aidge_core.Tensor] = [] + # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor + for i in input_data: + nb_dims = len(i[1].shape) + if nb_dims == 3: + 
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) + elif nb_dims == 4: # elif (not if): a 3-D input must not also reach the else branch and be appended twice + ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) + else: + ordered_inputs.append(aidge_core.Tensor(i[1])) + + # set inputs for the export + for i, inp in enumerate(model.get_ordered_inputs()): + op = inp[0].get_operator() + op.set_input(i, ordered_inputs[i]) + + model.forward_dims([t.dims() for t in ordered_inputs]) + + scheduler = aidge_core.SequentialScheduler(model) + scheduler.generate_scheduling() + + + operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() + print(" │ Generating export...", end="", flush=True) + folder_name: str = f"{operator_type.lower()}_test_export_cpp" + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + aidge_core.export_utils.scheduler_export( + scheduler, + folder_name, + aidge_export_cpp.ExportLibCpp, # NOTE(review): aidge_export_cpp is not imported by this module (NameError at call time) - confirm the intended export lib + memory_manager=aidge_core.mem_info.generate_optimized_memory_info, + memory_manager_args={"wrapping": False } + ) + aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model) + print(" ok") + + print(" │ Compiling...", end="", flush=True) + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + run(['make'], cwd=folder_name, stdout=f) + print(" ok") + output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) + folder_path = os.path.abspath(folder_name) + if os.path.exists(folder_path): + rmtree(folder_path, ignore_errors=True) + + outputs_str: list[str] = output_str.stdout.strip().split('\n') + outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)] + + for i, pair in enumerate(model.get_ordered_outputs()): + dims = pair[0].get_operator().get_output(pair[1]).dims() + nb_dims = len(dims) + dims_permutted = dims + if nb_dims == 3: + dims_permutted = [dims[0], dims[2], dims[1]] + if nb_dims ==
4: + dims_permutted = [dims[0], dims[2], dims[3], dims[1]] + + if np.prod(dims) != outputs[i].size: + aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims) + outputs[i] = outputs[i].reshape(dims_permutted) + if nb_dims == 3: + outputs[i] = outputs[i].transpose(0,2,1) + if nb_dims == 4: + outputs[i] = outputs[i].transpose(0,3,1,2) + + return outputs \ No newline at end of file diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja new file mode 100644 index 0000000..a5f8e5e --- /dev/null +++ b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja @@ -0,0 +1,474 @@ +/* USER CODE BEGIN Header */ +/** + ****************************************************************************** + * @file : main.c + * @brief : Main program body + ****************************************************************************** + * @attention + * + * Copyright (c) 2024 STMicroelectronics. + * All rights reserved. + * + * This software is licensed under terms that can be found in the LICENSE file + * in the root directory of this software component. + * If no LICENSE file comes with this software, it is provided AS-IS. 
+ * + ****************************************************************************** + */ +/* USER CODE END Header */ +/* Includes ------------------------------------------------------------------*/ +#include "main.h" +#include "string.h" + +/* Private includes ----------------------------------------------------------*/ +/* USER CODE BEGIN Includes */ + +#include <stdio.h> +#include <stdint.h> + +/* USER CODE END Includes */ + +/* Private typedef -----------------------------------------------------------*/ +/* USER CODE BEGIN PTD */ + +/* USER CODE END PTD */ + +/* Private define ------------------------------------------------------------*/ +/* USER CODE BEGIN PD */ + +/* USER CODE END PD */ + +/* Private macro -------------------------------------------------------------*/ +/* USER CODE BEGIN PM */ + +/* USER CODE END PM */ + +/* Private variables ---------------------------------------------------------*/ +#if defined ( __ICCARM__ ) /*!< IAR Compiler */ +#pragma location=0x2004c000 +ETH_DMADescTypeDef DMARxDscrTab[ETH_RX_DESC_CNT]; /* Ethernet Rx DMA Descriptors */ +#pragma location=0x2004c0a0 +ETH_DMADescTypeDef DMATxDscrTab[ETH_TX_DESC_CNT]; /* Ethernet Tx DMA Descriptors */ + +#elif defined ( __CC_ARM ) /* MDK ARM Compiler */ + +__attribute__((at(0x2004c000))) ETH_DMADescTypeDef DMARxDscrTab[ETH_RX_DESC_CNT]; /* Ethernet Rx DMA Descriptors */ +__attribute__((at(0x2004c0a0))) ETH_DMADescTypeDef DMATxDscrTab[ETH_TX_DESC_CNT]; /* Ethernet Tx DMA Descriptors */ + +#elif defined ( __GNUC__ ) /* GNU Compiler */ + +ETH_DMADescTypeDef DMARxDscrTab[ETH_RX_DESC_CNT] __attribute__((section(".RxDecripSection"))); /* Ethernet Rx DMA Descriptors */ +ETH_DMADescTypeDef DMATxDscrTab[ETH_TX_DESC_CNT] __attribute__((section(".TxDecripSection"))); /* Ethernet Tx DMA Descriptors */ +#endif + +ETH_TxPacketConfig TxConfig; + +ETH_HandleTypeDef heth; + +TIM_HandleTypeDef htim2; + +UART_HandleTypeDef huart3; + +PCD_HandleTypeDef hpcd_USB_OTG_FS; + +/* USER CODE BEGIN PV */ + +/* 
USER CODE END PV */ + +/* Private function prototypes -----------------------------------------------*/ +void SystemClock_Config(void); +static void MX_GPIO_Init(void); +static void MX_ETH_Init(void); +static void MX_TIM2_Init(void); +static void MX_USART3_UART_Init(void); +static void MX_USB_OTG_FS_PCD_Init(void); +/* USER CODE BEGIN PFP */ + +/* USER CODE END PFP */ + +/* Private user code ---------------------------------------------------------*/ +/* USER CODE BEGIN 0 */ + +int _write(int file, char *data, int len) +{ + HAL_StatusTypeDef status = HAL_UART_Transmit(&huart3, (uint8_t*) data, len, 1000); + + return (status == HAL_OK ? len : 0); +} + +/* USER CODE END 0 */ + +/** + * @brief The application entry point. + * @retval int + */ +#include "{{ call_function }}.hpp" +int main(void) +{ + + /* USER CODE BEGIN 1 */ + + /* USER CODE END 1 */ + + /* MCU Configuration--------------------------------------------------------*/ + + /* Reset of all peripherals, Initializes the Flash interface and the Systick. 
*/ + HAL_Init(); + + /* USER CODE BEGIN Init */ + + /* USER CODE END Init */ + + /* Configure the system clock */ + SystemClock_Config(); + + /* USER CODE BEGIN SysInit */ + + /* USER CODE END SysInit */ + + /* Initialize all configured peripherals */ + MX_GPIO_Init(); + MX_ETH_Init(); + MX_TIM2_Init(); + MX_USART3_UART_Init(); + MX_USB_OTG_FS_PCD_Init(); + /* USER CODE BEGIN 2 */ + + printf("\r\n"); + printf("*****************************************************\r\n"); + printf("****************** DEMO EXPORT ARM ******************\r\n"); + printf("*****************************************************\r\n"); + printf("\r\n"); + {{ call_function }}(); + printf("\r\n"); + printf("*****************************************************\r\n"); + printf("********************** END DEMO *********************\r\n"); + printf("*****************************************************\r\n"); + printf("\r\n"); + + /* USER CODE END 2 */ + + /* Infinite loop */ + /* USER CODE BEGIN WHILE */ + while (1) + { + /* USER CODE END WHILE */ + + /* USER CODE BEGIN 3 */ + } + /* USER CODE END 3 */ +} + +/** + * @brief System Clock Configuration + * @retval None + */ +void SystemClock_Config(void) +{ + RCC_OscInitTypeDef RCC_OscInitStruct = {0}; + RCC_ClkInitTypeDef RCC_ClkInitStruct = {0}; + + /** Configure LSE Drive Capability + */ + HAL_PWR_EnableBkUpAccess(); + + /** Configure the main internal regulator output voltage + */ + __HAL_RCC_PWR_CLK_ENABLE(); + __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE3); + + /** Initializes the RCC Oscillators according to the specified parameters + * in the RCC_OscInitTypeDef structure. 
+ */ + RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSE; + RCC_OscInitStruct.HSEState = RCC_HSE_BYPASS; + RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON; + RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE; + RCC_OscInitStruct.PLL.PLLM = 4; + RCC_OscInitStruct.PLL.PLLN = 72; + RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2; + RCC_OscInitStruct.PLL.PLLQ = 3; + if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK) + { + Error_Handler(); + } + + /** Initializes the CPU, AHB and APB buses clocks + */ + RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK + |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2; + RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK; + RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1; + RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV2; + RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV1; + + if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_2) != HAL_OK) + { + Error_Handler(); + } +} + +/** + * @brief ETH Initialization Function + * @param None + * @retval None + */ +static void MX_ETH_Init(void) +{ + + /* USER CODE BEGIN ETH_Init 0 */ + + /* USER CODE END ETH_Init 0 */ + + static uint8_t MACAddr[6]; + + /* USER CODE BEGIN ETH_Init 1 */ + + /* USER CODE END ETH_Init 1 */ + heth.Instance = ETH; + MACAddr[0] = 0x00; + MACAddr[1] = 0x80; + MACAddr[2] = 0xE1; + MACAddr[3] = 0x00; + MACAddr[4] = 0x00; + MACAddr[5] = 0x00; + heth.Init.MACAddr = &MACAddr[0]; + heth.Init.MediaInterface = HAL_ETH_RMII_MODE; + heth.Init.TxDesc = DMATxDscrTab; + heth.Init.RxDesc = DMARxDscrTab; + heth.Init.RxBuffLen = 1524; + + /* USER CODE BEGIN MACADDRESS */ + + /* USER CODE END MACADDRESS */ + + if (HAL_ETH_Init(&heth) != HAL_OK) + { + Error_Handler(); + } + + memset(&TxConfig, 0 , sizeof(ETH_TxPacketConfig)); + TxConfig.Attributes = ETH_TX_PACKETS_FEATURES_CSUM | ETH_TX_PACKETS_FEATURES_CRCPAD; + TxConfig.ChecksumCtrl = ETH_CHECKSUM_IPHDR_PAYLOAD_INSERT_PHDR_CALC; + TxConfig.CRCPadCtrl = ETH_CRC_PAD_INSERT; + /* USER CODE BEGIN ETH_Init 2 */ 
+ + /* USER CODE END ETH_Init 2 */ + +} + +/** + * @brief TIM2 Initialization Function + * @param None + * @retval None + */ +static void MX_TIM2_Init(void) +{ + + /* USER CODE BEGIN TIM2_Init 0 */ + + /* USER CODE END TIM2_Init 0 */ + + TIM_ClockConfigTypeDef sClockSourceConfig = {0}; + TIM_MasterConfigTypeDef sMasterConfig = {0}; + + /* USER CODE BEGIN TIM2_Init 1 */ + + /* USER CODE END TIM2_Init 1 */ + htim2.Instance = TIM2; + htim2.Init.Prescaler = 16000; + htim2.Init.CounterMode = TIM_COUNTERMODE_UP; + htim2.Init.Period = 1000; + htim2.Init.ClockDivision = TIM_CLOCKDIVISION_DIV1; + htim2.Init.AutoReloadPreload = TIM_AUTORELOAD_PRELOAD_DISABLE; + if (HAL_TIM_Base_Init(&htim2) != HAL_OK) + { + Error_Handler(); + } + sClockSourceConfig.ClockSource = TIM_CLOCKSOURCE_INTERNAL; + if (HAL_TIM_ConfigClockSource(&htim2, &sClockSourceConfig) != HAL_OK) + { + Error_Handler(); + } + sMasterConfig.MasterOutputTrigger = TIM_TRGO_RESET; + sMasterConfig.MasterSlaveMode = TIM_MASTERSLAVEMODE_DISABLE; + if (HAL_TIMEx_MasterConfigSynchronization(&htim2, &sMasterConfig) != HAL_OK) + { + Error_Handler(); + } + /* USER CODE BEGIN TIM2_Init 2 */ + + /* USER CODE END TIM2_Init 2 */ + +} + +/** + * @brief USART3 Initialization Function + * @param None + * @retval None + */ +static void MX_USART3_UART_Init(void) +{ + + /* USER CODE BEGIN USART3_Init 0 */ + + /* USER CODE END USART3_Init 0 */ + + /* USER CODE BEGIN USART3_Init 1 */ + + /* USER CODE END USART3_Init 1 */ + huart3.Instance = USART3; + huart3.Init.BaudRate = 115200; + huart3.Init.WordLength = UART_WORDLENGTH_8B; + huart3.Init.StopBits = UART_STOPBITS_1; + huart3.Init.Parity = UART_PARITY_NONE; + huart3.Init.Mode = UART_MODE_TX_RX; + huart3.Init.HwFlowCtl = UART_HWCONTROL_NONE; + huart3.Init.OverSampling = UART_OVERSAMPLING_16; + huart3.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE; + huart3.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT; + if (HAL_UART_Init(&huart3) != HAL_OK) + { + Error_Handler(); + } + /* 
USER CODE BEGIN USART3_Init 2 */ + + /* USER CODE END USART3_Init 2 */ + +} + +/** + * @brief USB_OTG_FS Initialization Function + * @param None + * @retval None + */ +static void MX_USB_OTG_FS_PCD_Init(void) +{ + + /* USER CODE BEGIN USB_OTG_FS_Init 0 */ + + /* USER CODE END USB_OTG_FS_Init 0 */ + + /* USER CODE BEGIN USB_OTG_FS_Init 1 */ + + /* USER CODE END USB_OTG_FS_Init 1 */ + hpcd_USB_OTG_FS.Instance = USB_OTG_FS; + hpcd_USB_OTG_FS.Init.dev_endpoints = 6; + hpcd_USB_OTG_FS.Init.speed = PCD_SPEED_FULL; + hpcd_USB_OTG_FS.Init.dma_enable = DISABLE; + hpcd_USB_OTG_FS.Init.phy_itface = PCD_PHY_EMBEDDED; + hpcd_USB_OTG_FS.Init.Sof_enable = ENABLE; + hpcd_USB_OTG_FS.Init.low_power_enable = DISABLE; + hpcd_USB_OTG_FS.Init.lpm_enable = DISABLE; + hpcd_USB_OTG_FS.Init.vbus_sensing_enable = ENABLE; + hpcd_USB_OTG_FS.Init.use_dedicated_ep1 = DISABLE; + if (HAL_PCD_Init(&hpcd_USB_OTG_FS) != HAL_OK) + { + Error_Handler(); + } + /* USER CODE BEGIN USB_OTG_FS_Init 2 */ + + /* USER CODE END USB_OTG_FS_Init 2 */ + +} + +/** + * @brief GPIO Initialization Function + * @param None + * @retval None + */ +static void MX_GPIO_Init(void) +{ + GPIO_InitTypeDef GPIO_InitStruct = {0}; +/* USER CODE BEGIN MX_GPIO_Init_1 */ +/* USER CODE END MX_GPIO_Init_1 */ + + /* GPIO Ports Clock Enable */ + __HAL_RCC_GPIOC_CLK_ENABLE(); + __HAL_RCC_GPIOH_CLK_ENABLE(); + __HAL_RCC_GPIOA_CLK_ENABLE(); + __HAL_RCC_GPIOB_CLK_ENABLE(); + __HAL_RCC_GPIOD_CLK_ENABLE(); + __HAL_RCC_GPIOG_CLK_ENABLE(); + + /*Configure GPIO pin Output Level */ + HAL_GPIO_WritePin(GPIOB, LD1_Pin|LD3_Pin|LD2_Pin, GPIO_PIN_RESET); + + /*Configure GPIO pin Output Level */ + HAL_GPIO_WritePin(USB_PowerSwitchOn_GPIO_Port, USB_PowerSwitchOn_Pin, GPIO_PIN_RESET); + + /*Configure GPIO pin Output Level */ + HAL_GPIO_WritePin(GPIOC, GPIO_PIN_7, GPIO_PIN_RESET); + + /*Configure GPIO pin : USER_Btn_Pin */ + GPIO_InitStruct.Pin = USER_Btn_Pin; + GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING; + GPIO_InitStruct.Pull = GPIO_NOPULL; + 
HAL_GPIO_Init(USER_Btn_GPIO_Port, &GPIO_InitStruct); + + /*Configure GPIO pins : LD1_Pin LD3_Pin LD2_Pin */ + GPIO_InitStruct.Pin = LD1_Pin|LD3_Pin|LD2_Pin; + GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + HAL_GPIO_Init(GPIOB, &GPIO_InitStruct); + + /*Configure GPIO pin : USB_PowerSwitchOn_Pin */ + GPIO_InitStruct.Pin = USB_PowerSwitchOn_Pin; + GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + HAL_GPIO_Init(USB_PowerSwitchOn_GPIO_Port, &GPIO_InitStruct); + + /*Configure GPIO pin : USB_OverCurrent_Pin */ + GPIO_InitStruct.Pin = USB_OverCurrent_Pin; + GPIO_InitStruct.Mode = GPIO_MODE_INPUT; + GPIO_InitStruct.Pull = GPIO_NOPULL; + HAL_GPIO_Init(USB_OverCurrent_GPIO_Port, &GPIO_InitStruct); + + /*Configure GPIO pin : PC7 */ + GPIO_InitStruct.Pin = GPIO_PIN_7; + GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + HAL_GPIO_Init(GPIOC, &GPIO_InitStruct); + +/* USER CODE BEGIN MX_GPIO_Init_2 */ +/* USER CODE END MX_GPIO_Init_2 */ +} + +/* USER CODE BEGIN 4 */ + +/* USER CODE END 4 */ + +/** + * @brief This function is executed in case of error occurrence. + * @retval None + */ +void Error_Handler(void) +{ + /* USER CODE BEGIN Error_Handler_Debug */ + /* User can add his own implementation to report the HAL error return state */ + __disable_irq(); + while (1) + { + } + /* USER CODE END Error_Handler_Debug */ +} + +#ifdef USE_FULL_ASSERT +/** + * @brief Reports the name of the source file and the source line number + * where the assert_param error has occurred. 
+ * @param file: pointer to the source file name + * @param line: assert_param error line source number + * @retval None + */ +void assert_failed(uint8_t *file, uint32_t line) +{ + /* USER CODE BEGIN 6 */ + /* User can add his own implementation to report the file name and line number, + ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */ + /* USER CODE END 6 */ +} +#endif /* USE_FULL_ASSERT */ diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja new file mode 100644 index 0000000..ac5b695 --- /dev/null +++ b/aidge_export_arm_cortexm/templates/main_call/main_stm32h7.jinja @@ -0,0 +1,365 @@ +/* USER CODE BEGIN Header */ +/** + ****************************************************************************** + * @file : main.c + * @brief : Main program body + ****************************************************************************** + * @attention + * + * <h2><center>© Copyright (c) 2019 STMicroelectronics. + * All rights reserved.</center></h2> + * + * This software component is licensed by ST under BSD 3-Clause license, + * the "License"; You may not use this file except in compliance with the + * License. 
You may obtain a copy of the License at: + * opensource.org/licenses/BSD-3-Clause + * + ****************************************************************************** + */ +/* USER CODE END Header */ + +/* Includes ------------------------------------------------------------------*/ +#include "main.h" + +/* Private includes ----------------------------------------------------------*/ +/* USER CODE BEGIN Includes */ + +#include <stdio.h> + +#include "dnn/include/forward.hpp" + +/* USER CODE END Includes */ + +/* Private typedef -----------------------------------------------------------*/ +/* USER CODE BEGIN PTD */ + +/* USER CODE END PTD */ + +/* Private define ------------------------------------------------------------*/ +/* USER CODE BEGIN PD */ + + +/* USER CODE END PD */ + +/* Private macro -------------------------------------------------------------*/ +/* USER CODE BEGIN PM */ + +/* USER CODE END PM */ + +/* Private variables ---------------------------------------------------------*/ + +UART_HandleTypeDef huart3; + +/* USER CODE BEGIN PV */ + +/* USER CODE END PV */ + +/* Private function prototypes -----------------------------------------------*/ +void SystemClock_Config(void); +static void MX_GPIO_Init(void); +static void MX_USART3_UART_Init(void); +/* USER CODE BEGIN PFP */ + +/* USER CODE END PFP */ + +/* Private user code ---------------------------------------------------------*/ +/* USER CODE BEGIN 0 */ + +int _write(int file, char *data, int len) +{ + HAL_StatusTypeDef status = HAL_UART_Transmit(&huart3, (uint8_t*) data, len, 1000); + + return (status == HAL_OK ? len : 0); +} + +/* USER CODE END 0 */ + +/** + * @brief The application entry point. 
+ * @retval int + */ +#include "{{ call_function }}.hpp" +int main(void) +{ + /* USER CODE BEGIN 1 */ + + /* USER CODE END 1 */ + + + /* Enable I-Cache---------------------------------------------------------*/ + SCB_EnableICache(); + + /* Enable D-Cache---------------------------------------------------------*/ + SCB_EnableDCache(); + + /* MCU Configuration--------------------------------------------------------*/ + + /* Reset of all peripherals, Initializes the Flash interface and the Systick. */ + HAL_Init(); + + /* USER CODE BEGIN Init */ + + /* USER CODE END Init */ + + /* Configure the system clock */ + SystemClock_Config(); + + /* USER CODE BEGIN SysInit */ + + /* USER CODE END SysInit */ + + /* Initialize all configured peripherals */ + MX_GPIO_Init(); + MX_USART3_UART_Init(); + /* USER CODE BEGIN 2 */ + + printf("\r\n"); + printf("*****************************************************\r\n"); + printf("****************** DEMO EXPORT ARM ******************\r\n"); + printf("*****************************************************\r\n"); + printf("\r\n"); + {{ call_function }}(); + printf("\r\n"); + printf("*****************************************************\r\n"); + printf("********************** END DEMO *********************\r\n"); + printf("*****************************************************\r\n"); + printf("\r\n"); + + /* USER CODE END 3 */ +} + +/** + * @brief System Clock Configuration + * @retval None + */ +void SystemClock_Config(void) +{ + RCC_OscInitTypeDef RCC_OscInitStruct = {0}; + RCC_ClkInitTypeDef RCC_ClkInitStruct = {0}; + RCC_PeriphCLKInitTypeDef PeriphClkInitStruct = {0}; + + /** Supply configuration update enable + */ + HAL_PWREx_ConfigSupply(PWR_LDO_SUPPLY); + /** Configure the main internal regulator output voltage + */ + __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE1); + + while(!__HAL_PWR_GET_FLAG(PWR_FLAG_VOSRDY)) {} + /** Initializes the CPU, AHB and APB busses clocks + */ + RCC_OscInitStruct.OscillatorType = 
RCC_OSCILLATORTYPE_HSI; + RCC_OscInitStruct.HSIState = RCC_HSI_DIV1; + RCC_OscInitStruct.HSICalibrationValue = RCC_HSICALIBRATION_DEFAULT; + RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON; + RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSI; + RCC_OscInitStruct.PLL.PLLM = 4; + RCC_OscInitStruct.PLL.PLLN = 50; + RCC_OscInitStruct.PLL.PLLP = 2; + RCC_OscInitStruct.PLL.PLLQ = 2; + RCC_OscInitStruct.PLL.PLLR = 2; + RCC_OscInitStruct.PLL.PLLRGE = RCC_PLL1VCIRANGE_3; + RCC_OscInitStruct.PLL.PLLVCOSEL = RCC_PLL1VCOWIDE; + RCC_OscInitStruct.PLL.PLLFRACN = 0; + if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK) + { + Error_Handler(); + } + /** Initializes the CPU, AHB and APB busses clocks + */ + RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK + |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2 + |RCC_CLOCKTYPE_D3PCLK1|RCC_CLOCKTYPE_D1PCLK1; + RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK; + RCC_ClkInitStruct.SYSCLKDivider = RCC_SYSCLK_DIV1; + RCC_ClkInitStruct.AHBCLKDivider = RCC_HCLK_DIV2; + RCC_ClkInitStruct.APB3CLKDivider = RCC_APB3_DIV2; + RCC_ClkInitStruct.APB1CLKDivider = RCC_APB1_DIV2; + RCC_ClkInitStruct.APB2CLKDivider = RCC_APB2_DIV2; + RCC_ClkInitStruct.APB4CLKDivider = RCC_APB4_DIV2; + + if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_2) != HAL_OK) + { + Error_Handler(); + } + PeriphClkInitStruct.PeriphClockSelection = RCC_PERIPHCLK_USART3; + PeriphClkInitStruct.Usart234578ClockSelection = RCC_USART234578CLKSOURCE_D2PCLK1; + if (HAL_RCCEx_PeriphCLKConfig(&PeriphClkInitStruct) != HAL_OK) + { + Error_Handler(); + } +} + +/** + * @brief USART3 Initialization Function + * @param None + * @retval None + */ +static void MX_USART3_UART_Init(void) +{ + + /* USER CODE BEGIN USART3_Init 0 */ + + /* USER CODE END USART3_Init 0 */ + + /* USER CODE BEGIN USART3_Init 1 */ + + /* USER CODE END USART3_Init 1 */ + huart3.Instance = USART3; + huart3.Init.BaudRate = 115200; + huart3.Init.WordLength = UART_WORDLENGTH_8B; + huart3.Init.StopBits = 
UART_STOPBITS_1; + huart3.Init.Parity = UART_PARITY_NONE; + huart3.Init.Mode = UART_MODE_TX_RX; + huart3.Init.HwFlowCtl = UART_HWCONTROL_NONE; + huart3.Init.OverSampling = UART_OVERSAMPLING_16; + huart3.Init.OneBitSampling = UART_ONE_BIT_SAMPLE_DISABLE; + huart3.Init.ClockPrescaler = UART_PRESCALER_DIV1; + huart3.AdvancedInit.AdvFeatureInit = UART_ADVFEATURE_NO_INIT; + if (HAL_UART_Init(&huart3) != HAL_OK) + { + Error_Handler(); + } + if (HAL_UARTEx_SetTxFifoThreshold(&huart3, UART_TXFIFO_THRESHOLD_1_8) != HAL_OK) + { + Error_Handler(); + } + if (HAL_UARTEx_SetRxFifoThreshold(&huart3, UART_RXFIFO_THRESHOLD_1_8) != HAL_OK) + { + Error_Handler(); + } + if (HAL_UARTEx_DisableFifoMode(&huart3) != HAL_OK) + { + Error_Handler(); + } + /* USER CODE BEGIN USART3_Init 2 */ + + /* USER CODE END USART3_Init 2 */ + +} + +/** + * @brief GPIO Initialization Function + * @param None + * @retval None + */ +static void MX_GPIO_Init(void) +{ + GPIO_InitTypeDef GPIO_InitStruct = {0}; + + /* GPIO Ports Clock Enable */ + __HAL_RCC_GPIOC_CLK_ENABLE(); + __HAL_RCC_GPIOH_CLK_ENABLE(); + __HAL_RCC_GPIOA_CLK_ENABLE(); + __HAL_RCC_GPIOB_CLK_ENABLE(); + __HAL_RCC_GPIOD_CLK_ENABLE(); + __HAL_RCC_GPIOG_CLK_ENABLE(); + + /*Configure GPIO pin Output Level */ + HAL_GPIO_WritePin(GPIOB, GPIO_PIN_14|GPIO_PIN_7, GPIO_PIN_RESET); + + /*Configure GPIO pin Output Level */ + HAL_GPIO_WritePin(GPIOG, GPIO_PIN_6, GPIO_PIN_RESET); + + /*Configure GPIO pin : PC13 */ + GPIO_InitStruct.Pin = GPIO_PIN_13; + GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING; + GPIO_InitStruct.Pull = GPIO_NOPULL; + HAL_GPIO_Init(GPIOC, &GPIO_InitStruct); + + /*Configure GPIO pins : PC1 PC4 PC5 */ + GPIO_InitStruct.Pin = GPIO_PIN_1|GPIO_PIN_4|GPIO_PIN_5; + GPIO_InitStruct.Mode = GPIO_MODE_AF_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + GPIO_InitStruct.Alternate = GPIO_AF11_ETH; + HAL_GPIO_Init(GPIOC, &GPIO_InitStruct); + + /*Configure GPIO pins : PA1 PA2 PA7 */ + GPIO_InitStruct.Pin = 
GPIO_PIN_1|GPIO_PIN_2|GPIO_PIN_7; + GPIO_InitStruct.Mode = GPIO_MODE_AF_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + GPIO_InitStruct.Alternate = GPIO_AF11_ETH; + HAL_GPIO_Init(GPIOA, &GPIO_InitStruct); + + /*Configure GPIO pin : PB13 */ + GPIO_InitStruct.Pin = GPIO_PIN_13; + GPIO_InitStruct.Mode = GPIO_MODE_AF_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + GPIO_InitStruct.Alternate = GPIO_AF11_ETH; + HAL_GPIO_Init(GPIOB, &GPIO_InitStruct); + + /*Configure GPIO pins : PB14 PB7 */ + GPIO_InitStruct.Pin = GPIO_PIN_14|GPIO_PIN_7; + GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + HAL_GPIO_Init(GPIOB, &GPIO_InitStruct); + + /*Configure GPIO pin : PG6 */ + GPIO_InitStruct.Pin = GPIO_PIN_6; + GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + HAL_GPIO_Init(GPIOG, &GPIO_InitStruct); + + /*Configure GPIO pin : PG7 */ + GPIO_InitStruct.Pin = GPIO_PIN_7; + GPIO_InitStruct.Mode = GPIO_MODE_INPUT; + GPIO_InitStruct.Pull = GPIO_NOPULL; + HAL_GPIO_Init(GPIOG, &GPIO_InitStruct); + + /*Configure GPIO pins : PA8 PA10 PA11 PA12 */ + GPIO_InitStruct.Pin = GPIO_PIN_8|GPIO_PIN_10|GPIO_PIN_11|GPIO_PIN_12; + GPIO_InitStruct.Mode = GPIO_MODE_AF_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + GPIO_InitStruct.Alternate = GPIO_AF10_OTG1_FS; + HAL_GPIO_Init(GPIOA, &GPIO_InitStruct); + + /*Configure GPIO pins : PG11 PG13 */ + GPIO_InitStruct.Pin = GPIO_PIN_11|GPIO_PIN_13; + GPIO_InitStruct.Mode = GPIO_MODE_AF_PP; + GPIO_InitStruct.Pull = GPIO_NOPULL; + GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW; + GPIO_InitStruct.Alternate = GPIO_AF11_ETH; + HAL_GPIO_Init(GPIOG, &GPIO_InitStruct); + +} + +/* USER CODE BEGIN 4 */ + +/* USER CODE END 4 */ + +/** + * @brief This function is executed in case of error 
occurrence. + * @retval None + */ +void Error_Handler(void) +{ + /* USER CODE BEGIN Error_Handler_Debug */ + /* User can add his own implementation to report the HAL error return state */ + + /* USER CODE END Error_Handler_Debug */ +} + +#ifdef USE_FULL_ASSERT +/** + * @brief Reports the name of the source file and the source line number + * where the assert_param error has occurred. + * @param file: pointer to the source file name + * @param line: assert_param error line source number + * @retval None + */ +void assert_failed(uint8_t *file, uint32_t line) +{ + /* USER CODE BEGIN 6 */ + /* User can add his own implementation to report the file name and line number, + ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */ + /* USER CODE END 6 */ +} +#endif /* USE_FULL_ASSERT */ + +/************************ (C) COPYRIGHT STMicroelectronics *****END OF FILE****/ diff --git a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja new file mode 100644 index 0000000..5f250b6 --- /dev/null +++ b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja @@ -0,0 +1,44 @@ +#ifndef MAIN_CALL_HPP +#define MAIN_CALL_HPP + +#include "forward.hpp" +#include <cstdio> +{% for name in inputs_name %} +#include "{{ name }}.h" +{% endfor %} + +{% set printf_formats = { + "double": "%lf", + "float": "%f", + "int8_t": "%hhd", + "int16_t": "%hd", + "int32_t": "%d", + "int64_t": "%lld", + "uint8_t": "%hhu", + "uint16_t": "%hu", + "uint32_t": "%u", + "uint64_t": "%llu" +} %} + +// Function declaration for main_call +int print_output( + // Initialize the output arrays + {%- for o in range(outputs_name | length) %} + {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = nullptr; + {% endfor %} + + // Call the forward function + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + + // Print the results of each output + {%- for o in
range(outputs_name | length) %} + printf("{{ outputs_name[o] }}:\n\r"); + for (int o = 0; o < {{ outputs_size[o] }}; ++o) { + printf("{{ printf_formats[outputs_dtype[o]] }} ", {{ outputs_name[o] }}[o]); + } + printf("\n\r"); + {% endfor %} + return 0; +); + +#endif // MAIN_CALL_HPP \ No newline at end of file diff --git a/ba.py b/ba.py new file mode 100644 index 0000000..4930900 --- /dev/null +++ b/ba.py @@ -0,0 +1,102 @@ + +import aidge_core +import aidge_backend_cpu +import aidge_export_arm_cortexm +import aidge_export_cpp +import aidge_onnx +# import aidge_quantization +import numpy as np +from aidge_export_arm_cortexm.export_registry import ExportLibAidgeARM, ExportLibCMSISNN +from aidge_export_cpp.export_registry import ExportLibCpp + +SEED = 123 +np.random.seed(SEED) +# PARAMETERS +INPUT_DIMS = [1, 3, 5, 5] +KERNEL_DIMS = [1, 1] +STRIDE_DIMS = [1, 1] +DILATION_DIMS = [1, 1] +IN_CHANNELS = 2 +OUT_CHANNELS = 2 +NO_BIAS = False + + +# EXECUTION + +PROPAGATE = True +FUSE = True +ADAPT_TO_BACKEND = True +CONSTANT_FOLDING = False +GENERATE_SCHEDULING = True +EXPORT = True +PRINT_INPUT_OUTPUT_INFOS = True +NAME = "test_cmsis_nn" +LIB = "CMSIS-NN" +INIT_WEIGHTS = True + +model = aidge_core.sequential([ + # aidge_core.Producer(aidge_core.Tensor(dims=INPUT_DIMS), name="dataProvider"), + aidge_core.Conv2D(in_channels=3, out_channels=OUT_CHANNELS, kernel_dims=KERNEL_DIMS, name='conv', stride_dims=STRIDE_DIMS, dilation_dims=DILATION_DIMS, no_bias=NO_BIAS), +]) + +model.save("init") + +input_array = np.random.random( size=INPUT_DIMS).astype(np.float32) +#Init des poids +if INIT_WEIGHTS: + for n in model.get_nodes() : + print( "Node : " + str(n)) + if n.type() == "Producer": + dims = n.get_operator().get_output(0).dims() + print(dims) + array = np.random.random( size=dims).astype(np.float32) + tensor = aidge_core.Tensor(array).set_datatype(aidge_core.dtype.float32) + n.get_operator().set_output(0, aidge_core.Tensor(array)) + 
print(str(n.get_operator().get_output(0).dformat())) + print("Data in :") + print(array) + + +model.compile("cpu", aidge_core.dtype.float32, dims=[INPUT_DIMS]) + + + +def propagate(model, scheduler, tensor): + # Setup the input + + input_tensor = aidge_core.Tensor(tensor) + # Tensor backend must be set again ... + input_tensor.set_backend("cpu") + input_tensor.set_datatype(aidge_core.dtype.float32) + # Run the inference + scheduler.forward(True, [input_tensor]) + # Gather the results + output_node = model.get_output_nodes().pop() + output_tensor = output_node.get_operator().get_output(0) + aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="expected_output", tensor=output_tensor) + return np.array(output_tensor) + + +if PROPAGATE: + model.set_datatype(aidge_core.dtype.float32) + model.set_backend("cpu") + scheduler = aidge_core.SequentialScheduler(model) + output_array =propagate(model, scheduler, input_array) + print(output_array) + + +scheduler = aidge_core.SequentialScheduler(model) +scheduler.generate_scheduling() +scheduler.graph_view() + + +aidge_export_arm_cortexm.export( + "conv_export", + graphview=model, + scheduler = scheduler, + board="stm32h7" + +) +aidge_export_arm_cortexm.utils.generate_call_function_arm_cortex_m("conv_export","main","stm32h7") +aidge_export_arm_cortexm.utils.generate_print_output_arm_cortex_m("conv_export",model,"stm32h7",input_array) +# aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="_input_0", tensor=aidge_core.Tensor(input_array)) -- GitLab From 0ea91be627ae10cc7f7a6d899361a529fe410f5d Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Fri, 11 Apr 2025 14:55:51 +0200 Subject: [PATCH 2/6] Refactor print_output function and update imports in utils.py --- .../templates/main_call/print_output.jinja | 8 +-- aidge_export_arm_cortexm/utils.py | 56 ++++++++++++++++++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git 
a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja index 5f250b6..5ee79cd 100644 --- a/aidge_export_arm_cortexm/templates/main_call/print_output.jinja +++ b/aidge_export_arm_cortexm/templates/main_call/print_output.jinja @@ -2,7 +2,7 @@ #define MAIN_CALL_HPP #include "forward.hpp" -#include <cstdio> +#include "stdio.h" {% for name in inputs_name %} #include "{{ name }}.h" {% endfor %} @@ -21,10 +21,10 @@ } %} // Function declaration for main_call -int print_output( +int print_output(){ // Initialize the output arrays {%- for o in range(outputs_name | length) %} - {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = nullptr; + {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = NULL; {% endfor %} // Call the forward function @@ -39,6 +39,6 @@ int print_output( printf("\n\r"); {% endfor %} return 0; -); +}; #endif // MAIN_CALL_HPP \ No newline at end of file diff --git a/aidge_export_arm_cortexm/utils.py b/aidge_export_arm_cortexm/utils.py index e5b166d..cf8cf74 100644 --- a/aidge_export_arm_cortexm/utils.py +++ b/aidge_export_arm_cortexm/utils.py @@ -1,5 +1,8 @@ from importlib.metadata import version - +from pathlib import Path +from aidge_core.export_utils import generate_file, data_conversion +import aidge_core +from aidge_export_arm_cortexm import ROOT def show_version(): version_aidge_export_arm_cortexm = version("aidge_export_arm_cortexm") @@ -7,3 +10,54 @@ def show_version(): def get_project_version()->str: return version("aidge_export_arm_cortexm") + +def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None: + generate_file( + str(Path(export_folder) / "Src" / "main.c"), + str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")), + call_function=call_function + ) + +def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None: + + outputs_name: list[str] = [] + 
outputs_dtype: list[str] = [] + outputs_size: list[int] = [] + inputs_name: list[str] = [] + gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() + gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() + + for in_node, in_idx in gv_inputs: + in_node_input, in_node_input_idx = in_node.input(in_idx) + in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" + inputs_name.append(in_name) + input_tensor = in_node.get_operator().get_input(in_idx) + if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): + if inputs_tensor is not None: + aidge_core.Log.notice("No support for inputs_tensor argument yet.") + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor) + + for out_node, out_id in gv_outputs: + outputs_name.append(f"{out_node.name()}_output_{out_id}") + out_tensor = out_node.get_operator().get_output(out_id) + outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) + outputs_size.append(out_tensor.size()) + + if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): + raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") + + ROOT = Path(__file__).resolve().parents[0] + generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) + generate_file( + str(Path(export_folder)/"Src" / "print_output.hpp"), + str(ROOT / "templates" / "main_call" / "print_output.jinja"), + func_name="model_forward", + inputs_name=inputs_name, + outputs_name=outputs_name, + outputs_dtype=outputs_dtype, + 
outputs_size=outputs_size + ) \ No newline at end of file -- GitLab From ca3258a87d1e8cc00efbb6f3e0fa14f281a3cdc5 Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Fri, 11 Apr 2025 14:58:41 +0200 Subject: [PATCH 3/6] Remove ba.py file, needs to fix also the main_stm32F7.jinja, he's not working, and moves method to benchmark.py --- ba.py | 102 ---------------------------------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 ba.py diff --git a/ba.py b/ba.py deleted file mode 100644 index 4930900..0000000 --- a/ba.py +++ /dev/null @@ -1,102 +0,0 @@ - -import aidge_core -import aidge_backend_cpu -import aidge_export_arm_cortexm -import aidge_export_cpp -import aidge_onnx -# import aidge_quantization -import numpy as np -from aidge_export_arm_cortexm.export_registry import ExportLibAidgeARM, ExportLibCMSISNN -from aidge_export_cpp.export_registry import ExportLibCpp - -SEED = 123 -np.random.seed(SEED) -# PARAMETERS -INPUT_DIMS = [1, 3, 5, 5] -KERNEL_DIMS = [1, 1] -STRIDE_DIMS = [1, 1] -DILATION_DIMS = [1, 1] -IN_CHANNELS = 2 -OUT_CHANNELS = 2 -NO_BIAS = False - - -# EXECUTION - -PROPAGATE = True -FUSE = True -ADAPT_TO_BACKEND = True -CONSTANT_FOLDING = False -GENERATE_SCHEDULING = True -EXPORT = True -PRINT_INPUT_OUTPUT_INFOS = True -NAME = "test_cmsis_nn" -LIB = "CMSIS-NN" -INIT_WEIGHTS = True - -model = aidge_core.sequential([ - # aidge_core.Producer(aidge_core.Tensor(dims=INPUT_DIMS), name="dataProvider"), - aidge_core.Conv2D(in_channels=3, out_channels=OUT_CHANNELS, kernel_dims=KERNEL_DIMS, name='conv', stride_dims=STRIDE_DIMS, dilation_dims=DILATION_DIMS, no_bias=NO_BIAS), -]) - -model.save("init") - -input_array = np.random.random( size=INPUT_DIMS).astype(np.float32) -#Init des poids -if INIT_WEIGHTS: - for n in model.get_nodes() : - print( "Node : " + str(n)) - if n.type() == "Producer": - dims = n.get_operator().get_output(0).dims() - print(dims) - array = np.random.random( size=dims).astype(np.float32) 
- tensor = aidge_core.Tensor(array).set_datatype(aidge_core.dtype.float32) - n.get_operator().set_output(0, aidge_core.Tensor(array)) - print(str(n.get_operator().get_output(0).dformat())) - print("Data in :") - print(array) - - -model.compile("cpu", aidge_core.dtype.float32, dims=[INPUT_DIMS]) - - - -def propagate(model, scheduler, tensor): - # Setup the input - - input_tensor = aidge_core.Tensor(tensor) - # Tensor backend must be set again ... - input_tensor.set_backend("cpu") - input_tensor.set_datatype(aidge_core.dtype.float32) - # Run the inference - scheduler.forward(True, [input_tensor]) - # Gather the results - output_node = model.get_output_nodes().pop() - output_tensor = output_node.get_operator().get_output(0) - aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="expected_output", tensor=output_tensor) - return np.array(output_tensor) - - -if PROPAGATE: - model.set_datatype(aidge_core.dtype.float32) - model.set_backend("cpu") - scheduler = aidge_core.SequentialScheduler(model) - output_array =propagate(model, scheduler, input_array) - print(output_array) - - -scheduler = aidge_core.SequentialScheduler(model) -scheduler.generate_scheduling() -scheduler.graph_view() - - -aidge_export_arm_cortexm.export( - "conv_export", - graphview=model, - scheduler = scheduler, - board="stm32h7" - -) -aidge_export_arm_cortexm.utils.generate_call_function_arm_cortex_m("conv_export","main","stm32h7") -aidge_export_arm_cortexm.utils.generate_print_output_arm_cortex_m("conv_export",model,"stm32h7",input_array) -# aidge_core.export_utils.generate_input_file(export_folder="conv_export", array_name="_input_0", tensor=aidge_core.Tensor(input_array)) -- GitLab From e4bced38535e7820a4408be0aee99af5f57382d5 Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Fri, 11 Apr 2025 15:09:20 +0200 Subject: [PATCH 4/6] Refactor benchmark and utils: move generate_call_function_arm_cortex_m and generate_print_output_arm_cortex_m to 
benchmark.py --- aidge_export_arm_cortexm/benchmark.py | 189 ++++++++------------------ aidge_export_arm_cortexm/utils.py | 50 ------- 2 files changed, 55 insertions(+), 184 deletions(-) diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py index 6366c57..1c3004b 100644 --- a/aidge_export_arm_cortexm/benchmark.py +++ b/aidge_export_arm_cortexm/benchmark.py @@ -1,138 +1,59 @@ -import contextlib -import os -from shutil import rmtree -from subprocess import run - import numpy as np import aidge_core -import aidge_backend_cpu -import aidge_export_arm_cortexm - -def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]: - # load and set up the model - # model.set_datatype(ai.dtype.float32) - model.set_backend("cpu") - - # create input Tensor list for the GraphView - ordered_inputs: list[aidge_core.Tensor] = [] - # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor - for i in input_data: - nb_dims = len(i[1].shape) - if nb_dims == 3: - ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) - if nb_dims == 4: - ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) +from pathlib import Path +from aidge_core.export_utils import generate_file, data_conversion +from aidge_export_arm_cortexm import ROOT + + + +def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None: + generate_file( + str(Path(export_folder) / "Src" / "main.c"), + str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")), + call_function=call_function + ) + +def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None: + + outputs_name: list[str] = [] + outputs_dtype: list[str] = [] + outputs_size: list[int] = [] + inputs_name: 
list[str] = [] + gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() + gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() + + for in_node, in_idx in gv_inputs: + in_node_input, in_node_input_idx = in_node.input(in_idx) + in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" + inputs_name.append(in_name) + input_tensor = in_node.get_operator().get_input(in_idx) + if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): + if inputs_tensor is not None: + aidge_core.Log.notice("No support for inputs_tensor argument yet.") + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") else: - ordered_inputs.append(aidge_core.Tensor(i[1])) - - # set inputs for the export - for i, inp in enumerate(model.get_ordered_inputs()): - op = inp[0].get_operator() - op.set_input(i, ordered_inputs[i]) - - model.forward_dims([t.dims() for t in ordered_inputs]) - - scheduler = aidge_core.SequentialScheduler(model) - scheduler.generate_scheduling() - - # for ordered_input in ordered_inputs: - # ordered_input.set_backend("cpu") - operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() - print(" ├─Generating export...", end="", flush=True) - folder_name: str = f"{operator_type.lower()}_test_export_cpp" - with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): - aidge_core.export_utils.scheduler_export( - scheduler, - folder_name, - aidge_export_arm_cortexm.ExportLibAidgeARM, - memory_manager=aidge_core.mem_info.generate_optimized_memory_info, - memory_manager_args={"wrapping": False } - ) - aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup) - print(" ok") - - 
print(" ├─Compiling...", end="", flush=True) - with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): - run(['make'], cwd=folder_name, stdout=f) - print(" ok") - timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) - - folder_path = os.path.abspath(folder_name) - if os.path.exists(folder_path): - rmtree(folder_path, ignore_errors=True) - - timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()] - return timings - -def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]: - # load and set up the model - model.set_backend("cpu") - - # create input Tensor list for the GraphView - ordered_inputs: list[aidge_core.Tensor] = [] - # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor - for i in input_data: - nb_dims = len(i[1].shape) - if nb_dims == 3: - ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) - if nb_dims == 4: - ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) - else: - ordered_inputs.append(aidge_core.Tensor(i[1])) - - # set inputs for the export - for i, inp in enumerate(model.get_ordered_inputs()): - op = inp[0].get_operator() - op.set_input(i, ordered_inputs[i]) - - model.forward_dims([t.dims() for t in ordered_inputs]) - - scheduler = aidge_core.SequentialScheduler(model) - scheduler.generate_scheduling() - - - operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() - print(" │ Generating export...", end="", flush=True) - folder_name: str = f"{operator_type.lower()}_test_export_cpp" - with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): - aidge_core.export_utils.scheduler_export( - scheduler, - folder_name, - aidge_export_cpp.ExportLibCpp, - memory_manager=aidge_core.mem_info.generate_optimized_memory_info, - memory_manager_args={"wrapping": False } - ) - 
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model) - print(" ok") - - print(" │ Compiling...", end="", flush=True) - with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): - run(['make'], cwd=folder_name, stdout=f) - print(" ok") - output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) - folder_path = os.path.abspath(folder_name) - if os.path.exists(folder_path): - rmtree(folder_path, ignore_errors=True) - - outputs_str: list[str] = output_str.stdout.strip().split('\n') - outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)] - - for i, pair in enumerate(model.get_ordered_outputs()): - dims = pair[0].get_operator().get_output(pair[1]).dims() - nb_dims = len(dims) - dims_permutted = dims - if nb_dims == 3: - dims_permutted = [dims[0], dims[2], dims[1]] - if nb_dims == 4: - dims_permutted = [dims[0], dims[2], dims[3], dims[1]] - - if np.prod(dims) != outputs[i].size: - aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims) - outputs[i] = outputs[i].reshape(dims_permutted) - if nb_dims == 3: - outputs[i] = outputs[i].transpose(0,2,1) - if nb_dims == 4: - outputs[i] = outputs[i].transpose(0,3,1,2) - - return outputs \ No newline at end of file + aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor) + + for out_node, out_id in gv_outputs: + outputs_name.append(f"{out_node.name()}_output_{out_id}") + out_tensor = out_node.get_operator().get_output(out_id) + outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) + outputs_size.append(out_tensor.size()) + + if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): + raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") + + ROOT = Path(__file__).resolve().parents[0] + 
generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) + generate_file( + str(Path(export_folder)/"Src" / "print_output.hpp"), + str(ROOT / "templates" / "main_call" / "print_output.jinja"), + func_name="model_forward", + inputs_name=inputs_name, + outputs_name=outputs_name, + outputs_dtype=outputs_dtype, + outputs_size=outputs_size + ) \ No newline at end of file diff --git a/aidge_export_arm_cortexm/utils.py b/aidge_export_arm_cortexm/utils.py index cf8cf74..295c2fe 100644 --- a/aidge_export_arm_cortexm/utils.py +++ b/aidge_export_arm_cortexm/utils.py @@ -11,53 +11,3 @@ def show_version(): def get_project_version()->str: return version("aidge_export_arm_cortexm") -def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None: - generate_file( - str(Path(export_folder) / "Src" / "main.c"), - str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")), - call_function=call_function - ) - -def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None: - - outputs_name: list[str] = [] - outputs_dtype: list[str] = [] - outputs_size: list[int] = [] - inputs_name: list[str] = [] - gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() - gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() - - for in_node, in_idx in gv_inputs: - in_node_input, in_node_input_idx = in_node.input(in_idx) - in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" - inputs_name.append(in_name) - input_tensor = in_node.get_operator().get_input(in_idx) - if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): - if inputs_tensor is not None: - aidge_core.Log.notice("No support for inputs_tensor argument yet.") - aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be 
functionnal after code generation.") - else: - aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") - else: - aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor) - - for out_node, out_id in gv_outputs: - outputs_name.append(f"{out_node.name()}_output_{out_id}") - out_tensor = out_node.get_operator().get_output(out_id) - outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) - outputs_size.append(out_tensor.size()) - - if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): - raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") - - ROOT = Path(__file__).resolve().parents[0] - generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) - generate_file( - str(Path(export_folder)/"Src" / "print_output.hpp"), - str(ROOT / "templates" / "main_call" / "print_output.jinja"), - func_name="model_forward", - inputs_name=inputs_name, - outputs_name=outputs_name, - outputs_dtype=outputs_dtype, - outputs_size=outputs_size - ) \ No newline at end of file -- GitLab From 2c9ef702373900770b3f6b8e72fb7e21d5efe634 Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Tue, 22 Apr 2025 14:25:31 +0200 Subject: [PATCH 5/6] Refactor benchmark and generation utilities: move functions to generate.py and add benchmark_inference_time template --- aidge_export_arm_cortexm/benchmark.py | 188 +++++++++++++----- aidge_export_arm_cortexm/generate.py | 103 ++++++++++ .../main_call/benchmark_inference_time.jinja | 56 ++++++ .../templates/main_call/main_stm32f7.jinja | 2 + 4 files changed, 299 insertions(+), 50 deletions(-) create mode 100644 aidge_export_arm_cortexm/generate.py create mode 100644 aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja diff --git a/aidge_export_arm_cortexm/benchmark.py 
b/aidge_export_arm_cortexm/benchmark.py index 1c3004b..e842dde 100644 --- a/aidge_export_arm_cortexm/benchmark.py +++ b/aidge_export_arm_cortexm/benchmark.py @@ -2,58 +2,146 @@ import numpy as np import aidge_core from pathlib import Path -from aidge_core.export_utils import generate_file, data_conversion +import aidge_export_arm_cortexm.generate from aidge_export_arm_cortexm import ROOT +import contextlib +import os +from shutil import rmtree +from subprocess import run +import numpy as np + +import aidge_core +import aidge_backend_cpu +import aidge_export_cpp -def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None: - generate_file( - str(Path(export_folder) / "Src" / "main.c"), - str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")), - call_function=call_function - ) - -def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None: - - outputs_name: list[str] = [] - outputs_dtype: list[str] = [] - outputs_size: list[int] = [] - inputs_name: list[str] = [] - gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() - gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() - - for in_node, in_idx in gv_inputs: - in_node_input, in_node_input_idx = in_node.input(in_idx) - in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" - inputs_name.append(in_name) - input_tensor = in_node.get_operator().get_input(in_idx) - if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): - if inputs_tensor is not None: - aidge_core.Log.notice("No support for inputs_tensor argument yet.") - aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") - else: - aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be 
functionnal after code generation.") +def measure_inference_time_lib_aidge(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]: + # load and set up the model + # model.set_datatype(ai.dtype.float32) + model.set_backend("cpu") + + # create input Tensor list for the GraphView + ordered_inputs: list[aidge_core.Tensor] = [] + # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor + for i in input_data: + nb_dims = len(i[1].shape) + if nb_dims == 3: + ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) + if nb_dims == 4: + ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) else: - aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor) - - for out_node, out_id in gv_outputs: - outputs_name.append(f"{out_node.name()}_output_{out_id}") - out_tensor = out_node.get_operator().get_output(out_id) - outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) - outputs_size.append(out_tensor.size()) - - if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): - raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") - - ROOT = Path(__file__).resolve().parents[0] - generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) - generate_file( - str(Path(export_folder)/"Src" / "print_output.hpp"), - str(ROOT / "templates" / "main_call" / "print_output.jinja"), - func_name="model_forward", - inputs_name=inputs_name, - outputs_name=outputs_name, - outputs_dtype=outputs_dtype, - outputs_size=outputs_size - ) \ No newline at end of file + ordered_inputs.append(aidge_core.Tensor(i[1])) + + # set inputs for the export + for i, inp in enumerate(model.get_ordered_inputs()): + op = inp[0].get_operator() + op.set_input(i, 
ordered_inputs[i]) + + model.forward_dims([t.dims() for t in ordered_inputs]) + + scheduler = aidge_core.SequentialScheduler(model) + scheduler.generate_scheduling() + + # for ordered_input in ordered_inputs: + # ordered_input.set_backend("cpu") + operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() + print(" ├─Generating export...", end="", flush=True) + folder_name: str = f"{operator_type.lower()}_test_export_arm_cortexm" + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + aidge_core.export_utils.scheduler_export( + scheduler, + folder_name, + aidge_export_arm_cortexm.ExportLibAidgeARM, + memory_manager=aidge_core.mem_info.generate_optimized_memory_info, + memory_manager_args={"wrapping": False } + ) + aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup) + aidge_export_arm_cortexm.generate.generate_print_output_arm_cortex_m(folder_name, model, nb_iterations, nb_warmup) + + print(" ok") + + print(" ├─Compiling...", end="", flush=True) + with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + run(['make'], cwd=folder_name, stdout=f) + print(" ok") + timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) + + folder_path = os.path.abspath(folder_name) + if os.path.exists(folder_path): + rmtree(folder_path, ignore_errors=True) + + timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()] + return timings + +# def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]: +# # load and set up the model +# model.set_backend("cpu") + +# # create input Tensor list for the GraphView +# ordered_inputs: list[aidge_core.Tensor] = [] +# # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor +# for i in input_data: +# nb_dims = len(i[1].shape) +# if nb_dims == 3: +# ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) +# if 
nb_dims == 4: +# ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) +# else: +# ordered_inputs.append(aidge_core.Tensor(i[1])) + +# # set inputs for the export +# for i, inp in enumerate(model.get_ordered_inputs()): +# op = inp[0].get_operator() +# op.set_input(i, ordered_inputs[i]) + +# model.forward_dims([t.dims() for t in ordered_inputs]) + +# scheduler = aidge_core.SequentialScheduler(model) +# scheduler.generate_scheduling() + + +# operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() +# print(" │ Generating export...", end="", flush=True) +# folder_name: str = f"{operator_type.lower()}_test_export_cpp" +# with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): +# aidge_core.export_utils.scheduler_export( +# scheduler, +# folder_name, +# aidge_export_cpp.ExportLibCpp, +# memory_manager=aidge_core.mem_info.generate_optimized_memory_info, +# memory_manager_args={"wrapping": False } +# ) +# aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model) +# print(" ok") + +# print(" │ Compiling...", end="", flush=True) +# with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): +# run(['make'], cwd=folder_name, stdout=f) +# print(" ok") +# output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) +# folder_path = os.path.abspath(folder_name) +# if os.path.exists(folder_path): +# rmtree(folder_path, ignore_errors=True) + +# outputs_str: list[str] = output_str.stdout.strip().split('\n') +# outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)] + +# for i, pair in enumerate(model.get_ordered_outputs()): +# dims = pair[0].get_operator().get_output(pair[1]).dims() +# nb_dims = len(dims) +# dims_permutted = dims +# if nb_dims == 3: +# dims_permutted = [dims[0], dims[2], dims[1]] +# if nb_dims == 4: +# dims_permutted = [dims[0], dims[2], dims[3], 
dims[1]] + +# if np.prod(dims) != outputs[i].size: +# aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims) +# outputs[i] = outputs[i].reshape(dims_permutted) +# if nb_dims == 3: +# outputs[i] = outputs[i].transpose(0,2,1) +# if nb_dims == 4: +# outputs[i] = outputs[i].transpose(0,3,1,2) + +# return outputs \ No newline at end of file diff --git a/aidge_export_arm_cortexm/generate.py b/aidge_export_arm_cortexm/generate.py new file mode 100644 index 0000000..934ad29 --- /dev/null +++ b/aidge_export_arm_cortexm/generate.py @@ -0,0 +1,103 @@ +from pathlib import Path +from aidge_core.export_utils import generate_file, data_conversion +from aidge_export_arm_cortexm import ROOT +import aidge_core +import aidge_export_arm_cortexm + +def generate_call_function_arm_cortex_m(export_folder: str, call_function: str, board: str) -> None: + generate_file( + str(Path(export_folder) / "Src" / "main.c"), + str(ROOT / "templates" / "main_call" / str("main_" + board + ".jinja")), + call_function=call_function + ) + +def generate_print_output_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, inputs_tensor=None) -> None: + + outputs_name: list[str] = [] + outputs_dtype: list[str] = [] + outputs_size: list[int] = [] + inputs_name: list[str] = [] + gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() + gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() + + for in_node, in_idx in gv_inputs: + in_node_input, in_node_input_idx = in_node.input(in_idx) + in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" + inputs_name.append(in_name) + input_tensor = in_node.get_operator().get_input(in_idx) + if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): + if inputs_tensor is not None: + aidge_core.Log.notice("No support for inputs_tensor argument yet.") + 
aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.export_utils.generate_input_file(export_folder=export_folder, array_name=in_name, tensor=input_tensor) + + for out_node, out_id in gv_outputs: + outputs_name.append(f"{out_node.name()}_output_{out_id}") + out_tensor = out_node.get_operator().get_output(out_id) + outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) + outputs_size.append(out_tensor.size()) + + if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): + raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") + + ROOT = Path(__file__).resolve().parents[0] + generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) + generate_file( + str(Path(export_folder)/"Src" / "print_output.hpp"), + str(ROOT / "templates" / "main_call" / "print_output.jinja"), + func_name="model_forward", + inputs_name=inputs_name, + outputs_name=outputs_name, + outputs_dtype=outputs_dtype, + outputs_size=outputs_size + ) + + +def generate_inference_time_arm_cortex_m(export_folder: str, graph_view: aidge_core.GraphView, board: str, nb_iterations, nb_warmup, inputs_tensor=None) -> None: + outputs_name: list[str] = [] + outputs_dtype: list[str] = [] + outputs_size: list[int] = [] + inputs_name: list[str] = [] + gv_inputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_inputs() + gv_outputs: list[tuple[aidge_core.Node, int]] = graph_view.get_ordered_outputs() + + for in_node, in_idx in gv_inputs: + in_node_input, in_node_input_idx = in_node.input(in_idx) + in_name = f"{in_node.name()}_input_{in_idx}" if in_node_input is None else f"{in_node_input.name()}_output_{in_node_input_idx}" + inputs_name.append(in_name) + input_tensor = 
in_node.get_operator().get_input(in_idx) + if input_tensor is None or input_tensor.undefined() or not input_tensor.has_impl(): + if inputs_tensor is not None: + aidge_core.Log.notice("No support for inputs_tensor argument yet.") + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.Log.notice(f"No input tensor set for {in_name}, main generated will not be functionnal after code generation.") + else: + aidge_core.export_utils.generate_input_file(str(Path(export_folder) / "data"), array_name=in_name, tensor=input_tensor) + + for out_node, out_id in gv_outputs: + outputs_name.append(f"{out_node.name()}_output_{out_id}") + out_tensor = out_node.get_operator().get_output(out_id) + outputs_dtype.append(data_conversion.aidge2c(out_tensor.dtype())) + outputs_size.append(out_tensor.size()) + + if len(outputs_name) != len(outputs_dtype) or len(outputs_name) != len(outputs_size): + raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") + + ROOT = Path(__file__).resolve().parents[0] + generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) + generate_file( + str(Path(export_folder) / "benchmark_inference_time.hpp"), + str(ROOT / "templates" / "main_call" / "benchmark_inference_time.jinja"), + func_name="model_forward", + inputs_name=inputs_name, + outputs_name=outputs_name, + outputs_dtype=outputs_dtype, + outputs_size=outputs_size, + nb_iterations=nb_iterations, + nb_warmup=nb_warmup, + board=board + ) \ No newline at end of file diff --git a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja new file mode 100644 index 0000000..028a530 --- /dev/null +++ b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja @@ -0,0 +1,56 @@ +#ifndef BENCHMARK_INFERENCE_TIME_HPP +#define 
BENCHMARK_INFERENCE_TIME_HPP + +#include "stdio.h" +#include "forward.hpp" + +// Necessary to have HAL_GetTick() +#include "{{ board }}xx_hal.h" +{% for name in inputs_name %} +#include "{{ name }}.h" +{% endfor %} + +{% set printf_formats = { + "double": "%lf", + "float": "%f", + "int8_t": "%hhd", + "int16_t": "%hd", + "int32_t": "%d", + "int64_t": "%lld", + "uint8_t": "%hhu", + "uint16_t": "%hu", + "uint32_t": "%u", + "uint64_t": "%llu" +} %} + +int benchmark_inference_time() +{ + // Initialize the output arrays + {%- for o in range(outputs_name | length) %} + {{ outputs_dtype[o] }}* {{ outputs_name[o] }} = NULL; + {% endfor %} + uint32_t start; + uint32_t end; + double times[{{ nb_iterations }}] = {0}; + for (std::size_t i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) { + if (i < {{ nb_warmup }}) { + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + } else { + start = HAL_GetTick(); + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); + end = HAL_GetTick(); + times[i - {{ nb_warmup }}] = ((double)(end - start)/CLOCKS_PER_SEC)/4.0; + } + } + + for (std::size_t i = 0; i < {{ nb_iterations }}; ++i) { + printf("%.10lf ", times[i]); + } + printf("\n"); + return 0; +} + +#endif BENCHMARK_INFERENCE_TIME_HPP \ No newline at end of file diff --git a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja index a5f8e5e..50dea3f 100644 --- a/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja +++ 
b/aidge_export_arm_cortexm/templates/main_call/main_stm32f7.jinja @@ -26,6 +26,8 @@ #include <stdio.h> #include <stdint.h> +#include "dnn/include/forward.hpp" + /* USER CODE END Includes */ /* Private typedef -----------------------------------------------------------*/ -- GitLab From 8d3507547309538d325b19e843691149c2a8b5bd Mon Sep 17 00:00:00 2001 From: Wissam Boussella <wissam.boussella@cea.fr> Date: Tue, 22 Apr 2025 16:07:22 +0200 Subject: [PATCH 6/6] now can generate inference time for stm32 : can compile but not yet tested on the h7 --- aidge_export_arm_cortexm/benchmark.py | 72 ------------------- aidge_export_arm_cortexm/generate.py | 2 +- .../main_call/benchmark_inference_time.jinja | 10 +-- 3 files changed, 6 insertions(+), 78 deletions(-) diff --git a/aidge_export_arm_cortexm/benchmark.py b/aidge_export_arm_cortexm/benchmark.py index e842dde..17f5ff9 100644 --- a/aidge_export_arm_cortexm/benchmark.py +++ b/aidge_export_arm_cortexm/benchmark.py @@ -73,75 +73,3 @@ def measure_inference_time_lib_aidge(model: aidge_core.GraphView, input_data: li timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()] return timings - -# def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]: -# # load and set up the model -# model.set_backend("cpu") - -# # create input Tensor list for the GraphView -# ordered_inputs: list[aidge_core.Tensor] = [] -# # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor -# for i in input_data: -# nb_dims = len(i[1].shape) -# if nb_dims == 3: -# ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) -# if nb_dims == 4: -# ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) -# else: -# ordered_inputs.append(aidge_core.Tensor(i[1])) - -# # set inputs for the export -# for i, inp in enumerate(model.get_ordered_inputs()): -# op = inp[0].get_operator() -# 
op.set_input(i, ordered_inputs[i]) - -# model.forward_dims([t.dims() for t in ordered_inputs]) - -# scheduler = aidge_core.SequentialScheduler(model) -# scheduler.generate_scheduling() - - -# operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() -# print(" │ Generating export...", end="", flush=True) -# folder_name: str = f"{operator_type.lower()}_test_export_cpp" -# with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): -# aidge_core.export_utils.scheduler_export( -# scheduler, -# folder_name, -# aidge_export_cpp.ExportLibCpp, -# memory_manager=aidge_core.mem_info.generate_optimized_memory_info, -# memory_manager_args={"wrapping": False } -# ) -# aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model) -# print(" ok") - -# print(" │ Compiling...", end="", flush=True) -# with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): -# run(['make'], cwd=folder_name, stdout=f) -# print(" ok") -# output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) -# folder_path = os.path.abspath(folder_name) -# if os.path.exists(folder_path): -# rmtree(folder_path, ignore_errors=True) - -# outputs_str: list[str] = output_str.stdout.strip().split('\n') -# outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)] - -# for i, pair in enumerate(model.get_ordered_outputs()): -# dims = pair[0].get_operator().get_output(pair[1]).dims() -# nb_dims = len(dims) -# dims_permutted = dims -# if nb_dims == 3: -# dims_permutted = [dims[0], dims[2], dims[1]] -# if nb_dims == 4: -# dims_permutted = [dims[0], dims[2], dims[3], dims[1]] - -# if np.prod(dims) != outputs[i].size: -# aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims) -# outputs[i] = outputs[i].reshape(dims_permutted) -# if nb_dims == 3: -# outputs[i] = outputs[i].transpose(0,2,1) -# if nb_dims == 4: -# 
outputs[i] = outputs[i].transpose(0,3,1,2) - -# return outputs \ No newline at end of file diff --git a/aidge_export_arm_cortexm/generate.py b/aidge_export_arm_cortexm/generate.py index 934ad29..9a9548b 100644 --- a/aidge_export_arm_cortexm/generate.py +++ b/aidge_export_arm_cortexm/generate.py @@ -88,7 +88,7 @@ def generate_inference_time_arm_cortex_m(export_folder: str, graph_view: aidge_c raise RuntimeError("FATAL: Output args list does not have the same length this is an internal bug.") ROOT = Path(__file__).resolve().parents[0] - generate_call_function_arm_cortex_m(Path(export_folder),"print_output",board=board) + generate_call_function_arm_cortex_m(Path(export_folder),"benchmark_inference_time",board=board) generate_file( str(Path(export_folder) / "benchmark_inference_time.hpp"), str(ROOT / "templates" / "main_call" / "benchmark_inference_time.jinja"), diff --git a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja index 028a530..2ce9596 100644 --- a/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja +++ b/aidge_export_arm_cortexm/templates/main_call/benchmark_inference_time.jinja @@ -7,7 +7,7 @@ // Necessary to have HAL_GetTick() #include "{{ board }}xx_hal.h" {% for name in inputs_name %} -#include "{{ name }}.h" +#include "data/{{ name }}.h" {% endfor %} {% set printf_formats = { @@ -32,7 +32,7 @@ int benchmark_inference_time() uint32_t start; uint32_t end; double times[{{ nb_iterations }}] = {0}; - for (std::size_t i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) { + for (unsigned int i = 0; i < {{ nb_iterations }} + {{ nb_warmup }}; ++i) { if (i < {{ nb_warmup }}) { {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); } else { @@ -42,15 +42,15 @@ int benchmark_inference_time() {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ 
outputs_name|join(", &") }}); {{ func_name }}({{ inputs_name|join(", ") }}{% if inputs_name %}, {% endif %}&{{ outputs_name|join(", &") }}); end = HAL_GetTick(); - times[i - {{ nb_warmup }}] = ((double)(end - start)/CLOCKS_PER_SEC)/4.0; + times[i - {{ nb_warmup }}] = (double)(end - start); } } - for (std::size_t i = 0; i < {{ nb_iterations }}; ++i) { + for (unsigned int i = 0; i < {{ nb_iterations }}; ++i) { printf("%.10lf ", times[i]); } printf("\n"); return 0; } -#endif BENCHMARK_INFERENCE_TIME_HPP \ No newline at end of file +#endif //BENCHMARK_INFERENCE_TIME_HPP \ No newline at end of file -- GitLab