Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (5)
Showing with 2747 additions and 2 deletions
###############################################################################
# Aidge Continuous Integration and Deployment                                 #
#                                                                             #
###############################################################################
stages:
  - static_analysis
  - build
  - test
  - coverage
  - release
  - deploy
include:
  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
    file:
      # choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
      - '.gitlab/ci/release/pip.gitlab-ci.yml'
      # Since aidge_export_tensorrt is a pure Python package, building on Windows and on Ubuntu doesn't differ.
      # - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
  before_script:
    - !reference [.setup:test:ubuntu_python, before_script]
    - DEPS_NAMES=("aidge_onnx" "aidge_core")
    - DEPENDENCY_JOB="build:ubuntu_python"
    - !reference [.ubuntu:download:artifacts, script]
    # Need to install extra dependencies for tests:
    - python -m pip install jinja2
coverage:ubuntu_python:
  before_script:
    - !reference [.setup:coverage:ubuntu_python, before_script]
    - DEPS_NAMES=("aidge_onnx" "aidge_core")
    - DEPENDENCY_JOB="build:ubuntu_python"
    - !reference [.ubuntu:download:artifacts, script]
    # Need to install extra dependencies for tests:
    - python -m pip install jinja2
@@ -33,7 +33,7 @@ def generate_file(filename, templatename, **kwargs):
-def export(export_folder, graphview, python_binding=True, trt_version="8.6"):
+def export(export_folder, graphview, python_binding=True, trt_version="10.10"):
     """Generate a TensorRT export.
     :param export_folder: Name of the folder where to generate the TensorRT export
@@ -42,7 +42,7 @@ def export(export_folder, graphview, python_binding=True, trt_version="8.6"):
     :type graphview: str or :py:class:`Aidge.GraphView`
     :param python_binding: If ``True``, clone PyBind into the export to enable python binding, defaults to True
     :type python_binding: bool, optional
-    :param trt_version: The supported TensorRT version, defaults to "8.6"
+    :param trt_version: The supported TensorRT version, defaults to "10.10"
     :type trt_version: str, optional
     """
     print(f"Generating TensorRT export in {export_folder}.")
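For context, this is how the updated entry point might be called: a minimal sketch, assuming the function is importable as `aidge_export_tensorrt.export` and using a placeholder ONNX path (neither is shown in the diff above).

```python
# Hedged usage sketch: the import path and file names are assumptions.
from aidge_export_tensorrt import export

# `graphview` can be a path to an ONNX model or an Aidge GraphView,
# as documented in the docstring above.
export("export_trt",           # folder that will receive the generated export
       "model.onnx",           # placeholder model path
       python_binding=True,    # clone pybind11 so the aidge_trt module can be built
       trt_version="10.10")    # new default introduced by this change
```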
cmake_minimum_required(VERSION 3.15)
# Options for compiling the export
option(PYBIND "python binding" ON)
option(TEST_DEBUG "c++ test for debugging" OFF)
# Define CMAKE constants
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE_INIT Release)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
# Add cmake modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
project(Aidge_Export_TRT)
enable_language(CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fPIC")
# To remove override warnings by deprecated functions in plugin modules
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress 997")
add_library(aidge_trt_cpp STATIC)
# CUDAToolkit
if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
find_package(CUDAToolkit)
else()
# For CMake >= 3.17.0, use the default FindCUDAToolkit provided by CMake
# => in this case, we need to prevent find_package() from using our own.
list(REMOVE_ITEM CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
find_package(CUDAToolkit)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
endif()
find_package(CuDNN)
find_package(TensorRT)
# Add include directory
target_include_directories(aidge_trt_cpp PUBLIC "include")
# Add plugin directory
target_include_directories(aidge_trt_cpp PUBLIC "plugins")
# Add cuda, cudnn and tensorrt include directories
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${CUDNN_INCLUDE_DIRS})
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${TensorRT_INCLUDE_DIRS})
# Add cpp src files
file(GLOB_RECURSE cpp_src_files "src/*.cpp" "plugins/*.cpp")
target_sources(aidge_trt_cpp PUBLIC ${cpp_src_files})
# Add cuda src files
# Use PUBLIC for target sources so that TensorRT can detect the plugins
file(GLOB_RECURSE cuda_src_files "src/*.cu" "plugins/*.cu")
target_sources(aidge_trt_cpp PUBLIC ${cuda_src_files})
# Add libraries relative to CUDA
target_link_libraries(aidge_trt_cpp PUBLIC CUDA::cudart CUDA::cublas)
# Add libraries relative to CuDNN
target_link_libraries(aidge_trt_cpp PUBLIC ${CUDNN_LIBRARY})
# Add libraries relative to TensorRT
target_link_libraries(aidge_trt_cpp PUBLIC trt::nvinfer trt::nvonnxparser)
if (PYBIND)
if(NOT EXISTS ${CMAKE_SOURCE_DIR}/python_binding/pybind11)
message(STATUS "Folder python_binding/pybind 11 does not exist. Cloning from Git repository.")
# Run the Git clone command
execute_process(
COMMAND git clone --depth=1 https://github.com/pybind/pybind11.git ${CMAKE_SOURCE_DIR}/python_binding/pybind11
RESULT_VARIABLE git_clone_result
)
# Check the result of the Git clone operation
if(git_clone_result)
message(FATAL_ERROR "Failed to clone https://github.com/pybind/pybind11.git.\nError code: ${git_clone_result}")
else()
message(STATUS "Pybind11 cloned successfully.")
endif()
execute_process(
COMMAND chmod -R a+w ${CMAKE_SOURCE_DIR}/python_binding/pybind11
)
endif()
message(STATUS "Using python_binding/pybind11 for Python binding")
add_subdirectory(${CMAKE_SOURCE_DIR}/python_binding/pybind11 ${CMAKE_BINARY_DIR}/pybind11)
pybind11_add_module(aidge_trt MODULE "python_binding/pybind_export.cpp")
target_include_directories(aidge_trt PUBLIC ${pybind11_INCLUDE_DIRS} "python_binding")
target_link_libraries(aidge_trt PUBLIC aidge_trt_cpp)
endif()
if (TEST_DEBUG)
add_executable(run_export "test_debug.cpp")
target_link_libraries(run_export PUBLIC aidge_trt_cpp)
endif()
#####################################################################
##: Different options to compile the export
##: Usage :
##:
##: make / make help
##: display the different options available
##: make build_cpp
##: compile the export on host for C++ apps
##: (generate an executable in build/bin)
##: make build_lib_python
##: compile the export on host for Python apps
##: (generate a python lib in build/lib)
##: make build_image_docker
##: generate the docker image of the tensorrt compiler
##: make build_cpp_docker
##: compile the export in a container for C++ apps
##: (generate an executable in build/bin)
##: make test_cpp_docker
##: test the executable for C++ apps in a container
##: make build_lib_python_docker
##: compile the export in a container for Python apps
##: (generate a python lib in build/lib)
##: make test_lib_python_docker
##: test the lib for Python apps in a container
##: make clean
##: clean up the build and bin folders
##:
#####################################################################
OBJDIR := build
BINDIR := bin
TARGET := ${BINDIR}/run_export
MAKEFLAGS := --no-print-directory
DOCKER_COMPILER := tools/tensorrt10.10_compiler.Dockerfile
IMAGE := tensorrt:10.10_compiler
all: help
.PHONY: build_cpp build_lib_python clean help
# Build for C++ app
build_cpp:
	./tools/compile_export_linux.sh -DPYBIND=0 -DTEST_DEBUG=1
# Build for Python app
build_lib_python:
	./tools/compile_export_linux.sh -DPYBIND=1 -DTEST_DEBUG=0
clean:
	if [ -d "$(OBJDIR)" ]; then rm -rf $(OBJDIR); fi
	if [ -d "$(BINDIR)" ]; then rm -rf $(BINDIR); fi
help:
	@grep -e "^##:" Makefile;
# Makefile target for building the tensorrt compiler image
.PHONY: build_image_docker
build_image_docker:
	@docker build --pull --rm -f "${DOCKER_COMPILER}" -t ${IMAGE} tools/
# Makefile targets for building and testing c++ app via docker
.PHONY: build_cpp_docker test_cpp_docker
build_cpp_docker:
	@docker run --rm --name compiling -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} make build_cpp
test_cpp_docker:
	@docker run --rm --gpus=all --name testing -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} ./${OBJDIR}/${TARGET}
# Makefile targets for building and testing python app via docker
.PHONY: build_lib_python_docker test_lib_python_docker
build_lib_python_docker:
	@docker run --rm --name compiling -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} make build_lib_python
test_lib_python_docker:
	@docker run --rm --gpus=all --name testing -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} python3 test.py
# Set CUDNN_FOUND, CUDNN_INCLUDE_DIRS, CUDNN_LIBRARY, CUDNN_VERSION_MAJOR, CUDNN_VERSION_MINOR, CUDNN_VERSION_PATCH and CUDNN_VERSION.
include(FindPackageHandleStandardArgs)
find_path(CUDNN_INCLUDE_DIRS cudnn.h HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES include)
find_library(CUDNN_LIBRARY NAMES cudnn HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 lib/x64)
find_package_handle_standard_args(CuDNN DEFAULT_MSG CUDNN_INCLUDE_DIRS CUDNN_LIBRARY)
if (CUDNN_INCLUDE_DIRS AND CUDNN_LIBRARY)
file(READ ${CUDNN_INCLUDE_DIRS}/cudnn.h CUDNN_FILE_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
CUDNN_VERSION_MINOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
CUDNN_VERSION_PATCH "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
if(NOT CUDNN_VERSION_MAJOR)
file(READ ${CUDNN_INCLUDE_DIRS}/cudnn_version.h CUDNN_FILE_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
endif()
if(NOT CUDNN_VERSION_MAJOR)
set(CUDNN_VERSION "?")
else()
set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
endif()
message(STATUS "CuDNN library status:")
message(STATUS " version: ${CUDNN_VERSION}")
message(STATUS " include path: ${CUDNN_INCLUDE_DIRS}")
message(STATUS " libraries: ${CUDNN_LIBRARY}")
endif()
# ~~~
# Copyright 2021 Olivier Le Doeuff
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# This module defines the following variables:
#
# - TensorRT_FOUND: A boolean specifying whether or not TensorRT was found.
# - TensorRT_VERSION: The exact version of TensorRT found
# - TensorRT_VERSION_MAJOR: The major version of TensorRT.
# - TensorRT_VERSION_MINOR: The minor version of TensorRT.
# - TensorRT_VERSION_PATCH: The patch version of TensorRT.
# - TensorRT_VERSION_TWEAK: The tweak version of TensorRT.
# - TensorRT_INCLUDE_DIRS: The path to TensorRT ``include`` folder containing the header files required to compile a project linking against TensorRT.
# - TensorRT_LIBRARY_DIRS: The path to TensorRT library directory that contains libraries.
#
# This module creates the following targets:
# - trt::nvinfer
# - trt::nvinfer_plugin
# - trt::nvonnxparser
# - trt::nvparsers
# This script was inspired by https://github.com/NicolasIRAGNE/CMakeScripts
# This script was inspired by https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake
#
# Hints
# ^^^^^
# A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look.
# ~~~
if(NOT TensorRT_FIND_COMPONENTS)
set(TensorRT_FIND_COMPONENTS nvinfer nvinfer_plugin nvonnxparser nvparsers)
endif()
set(TensorRT_LIBRARIES)
# find the include directory of TensorRT
find_path(
TensorRT_INCLUDE_DIR
NAMES NvInfer.h
PATHS ${TensorRT_ROOT} ENV TensorRT_ROOT
PATH_SUFFIXES include
)
string(FIND ${TensorRT_INCLUDE_DIR} "NOTFOUND" _include_dir_notfound)
if(NOT _include_dir_notfound EQUAL -1)
if(TensorRT_FIND_REQUIRED)
message(FATAL_ERROR "Fail to find TensorRT, please set TensorRT_ROOT. Include path not found.")
endif()
return()
endif()
set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})
# Extract version of tensorrt
if(EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_TWEAK REGEX "^#define NV_TENSORRT_BUILD [0-9]+.*$")
string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
string(REGEX REPLACE "^#define NV_TENSORRT_BUILD ([0-9]+).*$" "\\1" TensorRT_VERSION_TWEAK "${TensorRT_TWEAK}")
set(TensorRT_VERSION "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}.${TensorRT_VERSION_TWEAK}")
endif()
function(_find_trt_component component)
# Find the library for the component (i.e. nvinfer, nvparsers, etc.)
find_library(
TensorRT_${component}_LIBRARY
NAMES ${component}
PATHS ${TensorRT_ROOT} ${TENSORRT_LIBRARY_DIR} ENV TensorRT_ROOT
)
string(FIND ${TensorRT_${component}_LIBRARY} "NOTFOUND" _library_not_found)
if(NOT TensorRT_LIBRARY_DIR)
get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY)
set(TensorRT_LIBRARY_DIR
"${_path}"
CACHE INTERNAL "TensorRT_LIBRARY_DIR"
)
endif()
if(NOT TensorRT_LIBRARY_DIRS)
get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY)
set(TensorRT_LIBRARY_DIRS
"${_path}"
CACHE INTERNAL "TensorRT_LIBRARY_DIRS"
)
endif()
# Library found, and the target doesn't already exist
if(_library_not_found EQUAL -1 AND NOT TARGET trt::${component})
set(TensorRT_${component}_FOUND
TRUE
CACHE INTERNAL "Found ${component}"
)
# Create a target
add_library(trt::${component} IMPORTED INTERFACE)
target_include_directories(trt::${component} SYSTEM INTERFACE "${TensorRT_INCLUDE_DIRS}")
target_link_libraries(trt::${component} INTERFACE "${TensorRT_${component}_LIBRARY}")
set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_${component}_LIBRARY})
endif()
endfunction()
# Find each component
foreach(component IN LISTS TensorRT_FIND_COMPONENTS)
_find_trt_component(${component})
endforeach()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(TensorRT HANDLE_COMPONENTS VERSION_VAR TensorRT_VERSION REQUIRED_VARS TensorRT_INCLUDE_DIR)
#ifndef BATCH_STREAM_H
#define BATCH_STREAM_H
#include "NvInfer.h"
#include <algorithm>
#include <assert.h>
#include <stdio.h>
#include <vector>
class BatchStream
{
public:
BatchStream(unsigned int batchSize, std::vector<unsigned int> dims, unsigned int maxBatches, std::string prefix)
: _batchSize(batchSize)
, _maxBatches(maxBatches)
, _prefix(prefix)
{
_dims.nbDims = dims.size()+1; //The number of dimensions. Max 8.
assert(_dims.nbDims <= 8 && "The maximum number of dimensions supported for a tensor is 8");
_dims.d[0] = batchSize; //Batch Size
for(std::size_t i = 0; i < _dims.nbDims-1; ++i) _dims.d[i+1] = dims[i];
for(auto elem : dims) _imageSize *= elem;
_batch.resize(_batchSize * _imageSize, 0);
_fileBatch.resize(_dims.d[0] * _imageSize, 0);
reset(0);
}
// Resets data members
void reset(int firstBatch)
{
_batchCount = 0;
_fileCount = 0;
_fileBatchPos = _dims.d[0];
skip(firstBatch);
}
// Advance to next batch and return true, or return false if there is no batch left.
bool next()
{
if (_batchCount == _maxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < _batchSize; batchPos += csize, _fileBatchPos += csize)
{
assert(_fileBatchPos > 0 && _fileBatchPos <= _dims.d[0]);
if (_fileBatchPos == _dims.d[0] && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(_batchSize - batchPos, static_cast<int32_t>(_dims.d[0] - _fileBatchPos));
std::copy_n(getFileBatch() + _fileBatchPos * _imageSize, csize * _imageSize, getBatch() + batchPos * _imageSize);
}
_batchCount++;
return true;
}
// Skips the batches
void skip(int skipCount)
{
if (_batchSize >= _dims.d[0] && _batchSize % _dims.d[0] == 0 && _fileBatchPos == _dims.d[0])
{
_fileCount += skipCount * _batchSize / _dims.d[0];
return;
}
int x = _batchCount;
for (std::size_t i = 0; i < skipCount; ++i) next();
_batchCount = x;
}
float* getBatch() { return &_batch[0]; }
int getBatchesRead() const { return _batchCount; }
int getBatchSize() const { return _batchSize; }
int getImageSize() const { return _imageSize; }
nvinfer1::Dims getDims() const { return _dims; }
private:
float* getFileBatch() { return &_fileBatch[0]; }
bool update()
{
std::string inputFileName = _prefix + std::to_string(_fileCount++) + ".batch";
std::ifstream file(inputFileName, std::ios_base::in);
if (!file.is_open()) std::cout << "Could not open calibration file " << inputFileName << std::endl;
for(std::size_t i = 0; i < _imageSize; ++i)
{
if(file.eof())
{
std::cerr << "Error: Unexpected end of file. Wrong input size." << std::endl;
std::exit(EXIT_FAILURE);
}
file >> _fileBatch[i];
}
_fileBatchPos = 0;
file.close();
return true;
}
int _batchSize{0};
int _maxBatches{0};
int _batchCount{0};
int _fileCount{0};
int _fileBatchPos{0};
int _imageSize{1};
nvinfer1::Dims _dims;
std::vector<float> _batch;
std::vector<float> _fileBatch;
std::string _prefix;
};
#endif
#ifndef __AIDGE_TENSORRT_GRAPH_HPP__
#define __AIDGE_TENSORRT_GRAPH_HPP__
#include "Utils.hpp"
#include "cuda_utils.h"
#include <string>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
// Allow TensorRT to use up to 1GB of GPU memory for tactic selection
constexpr size_t MAX_WORKSPACE_SIZE = 1ULL << 30; // 1 GB
typedef enum
{
SYNC,
ASYNC
} ExecutionMode_T;
typedef struct
{
std::string name;
int nbElements;
int size;
} IODesc;
typedef struct
{
std::vector<IODesc> inputs;
std::vector<IODesc> outputs;
unsigned int nIO;
} IOGraphDesc;
/**
* @class Graph
* @brief Manages the lifecycle and execution of a neural network graph using TensorRT.
*
* The Graph class encapsulates the functionality required to manage, configure, and execute
* a neural network graph for inference using NVIDIA's TensorRT. This includes loading models
* from ONNX or TensorRT files, setting the CUDA device and data types, managing calibration
* for INT8 precision, and running inference in both synchronous and asynchronous modes.
*/
class Graph
{
public:
/**
* @brief Constructor for the Graph class.
*
* @param filePath Path to the file to load (default is empty).
* @param device_id Device ID to use (default is 0).
* @param nbbits Number of bits for data (default is -32).
*/
Graph(std::string const &filePath,
unsigned int device_id,
int nbbits);
/**
* @brief Destructor for the Graph class.
*/
~Graph();
/**
* @brief Set the CUDA device.
*
* @param id Device ID.
*/
void device(unsigned int id);
/**
* @brief Set the data type for the graph.
*
* @param nbbits Number of bits for data.
*/
void databits(int nbbits);
/**
* @brief Set the data mode for the graph.
*
* @param datatype Data type for the graph.
*/
void datamode(nvinfer1::DataType datatype);
/**
* @brief Load a file into the graph.
*
* @param filePath Path to the file to load.
*/
void load(std::string const &filePath);
/**
* @brief Load an ONNX file into the graph.
*
* @param onnxModelPath Path to the ONNX model file.
*/
void load_onnx(std::string const &onnxModelPath);
/**
* @brief Load a TensorRT file into the graph.
*
* @param trtModelPath Path to the TensorRT model file.
*/
void load_trt(std::string const &trtModelPath);
/**
* @brief Save the graph to a file.
*
* @param fileName Name of the file to save.
*/
void save(std::string const &fileName);
/**
* @brief Initializes the TensorRT engine and execution context for the Graph class. This involves building a serialized network, deserializing it into a CUDA engine, and setting up the necessary execution context and I/O descriptors.
*/
void initialize();
/**
* @brief Calibrate the graph using the calibration data found inside the `calibration` folder.
* This folder should include a `.info` file containing the dimensions of the calibration data, along with the data stored in `.batch` files.
* Calibration can be expensive, so it is beneficial to generate the calibration data once and then reuse it for subsequent builds of the network. The cache includes the regression cutoff and quantile values used to generate it, and will not be used if these do not match the settings of the current calibrator. However, the network should be recalibrated if its structure changes or if the input data set changes, and it is the responsibility of the application to ensure this.
*
* @param calibration_folder_path Path to the calibration folder.
* @param cache_file_path Path to the cache file.
* @param batch_size Batch size for calibration (default is 1).
*/
void calibrate(std::string const &calibration_folder_path, std::string const &cache_file_path, unsigned int batch_size);
/**
* @brief Profile the graph's execution by printing the average profiled TensorRT process time per stimulus.
*
* @param nb_iterations Number of iterations for profiling.
* @param mode Execution mode (SYNC or ASYNC).
*/
void profile(unsigned int nb_iterations, ExecutionMode_T mode = ExecutionMode_T::ASYNC);
/**
* @brief Automatically set the input profile for the graph.
*
* @param dims_inputs Dimensions of the input tensors.
*/
void auto_input_profile(std::vector<std::vector<int>> dims_inputs);
// Inference methods
/**
* @brief Run the graph.
*
* @param inputs Input data.
* @param outputs Output data.
* @param mode Execution mode (SYNC or ASYNC).
*/
void run(void **inputs, void **outputs, ExecutionMode_T mode = ExecutionMode_T::ASYNC);
/**
* @brief Run the graph asynchronously.
*
* @param inputs Input data.
* @param outputs Output data.
*/
void run_async(void **inputs, void **outputs);
/**
* @brief Run the graph synchronously.
*
* @param inputs Input data.
* @param outputs Output data.
*/
void run_sync(void **inputs, void **outputs);
// Getters
/**
* @brief Get the number of IO tensors in the graph.
*
* @return unsigned int Number of IO tensors.
*/
unsigned int getNbIO();
/**
* @brief Get the IO descriptors of the graph.
*
* @return IOGraphDesc IO descriptors.
*/
IOGraphDesc getIODescription();
protected:
/**
* @brief Initialize IO descriptors for the graph.
*/
void initialize_io_descriptors();
private:
// TensorRT objects for network, engine
// and context creation and management
nvinfer1::INetworkDefinition *_network{nullptr};
nvinfer1::ICudaEngine *_engine{nullptr};
nvinfer1::IBuilder *_builder{nullptr};
nvinfer1::IBuilderConfig *_builderconfig{nullptr};
nvinfer1::IExecutionContext *_context{nullptr};
nvinfer1::IOptimizationProfile *_profile{nullptr};
nvinfer1::IInt8Calibrator *_calibrator{nullptr};
// Graph IO information
IOGraphDesc _iodescriptors;
// Buffer for GPU computation
std::vector<void *> _iobuffer;
// Stream
cudaStream_t _stream{nullptr};
};
#endif // __AIDGE_TENSORRT_GRAPH_HPP__
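The calibrate() documentation above describes the expected calibration folder, but producing it is left to the caller. The following is a minimal Python sketch, assuming a placeholder input shape and sample count, that writes files in the format actually parsed later in this diff: Graph::calibrate() reads one dimension per line from `.info`, and BatchStream::update() reads whitespace-separated text values from `<prefix><index>.batch`, where the prefix is the folder path (so the folder should be passed with a trailing slash).

```python
# Hedged sketch for generating calibration data; shape, sample count and
# folder name are placeholders, and random data stands in for real stimuli.
import os
import numpy as np

calib_dir = "calibration_folder"
os.makedirs(calib_dir, exist_ok=True)

# One dimension per line, without the batch dimension (read by Graph::calibrate()).
dims = (1, 28, 28)
with open(os.path.join(calib_dir, ".info"), "w") as f:
    f.write("\n".join(str(d) for d in dims) + "\n")

# One sample per "<index>.batch" file, stored as whitespace-separated text values
# (read back by BatchStream::update() with `file >> ...`).
for i in range(16):
    sample = np.random.rand(*dims).astype(np.float32)
    np.savetxt(os.path.join(calib_dir, f"{i}.batch"), sample.reshape(-1))
```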
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <NvInfer.h>
#include "BatchStream.hpp"
#include "cuda_utils.h"
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string cacheName, bool readCache = true)
: _stream(stream),
_calibrationCacheName(cacheName),
_readCache(readCache)
{
nvinfer1::Dims dims = _stream.getDims();
_inputCount = _stream.getBatchSize() * dims.d[1] * dims.d[2] * dims.d[3];
CHECK_CUDA_STATUS(cudaMalloc(&_deviceInput, _inputCount * sizeof(float)));
_stream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_CUDA_STATUS(cudaFree(_deviceInput));
}
int getBatchSize() const noexcept override { return _stream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
{
if (!_stream.next())
{
return false;
}
CHECK_CUDA_STATUS(cudaMemcpy(_deviceInput, _stream.getBatch(), _inputCount * sizeof(float), cudaMemcpyHostToDevice));
bindings[0] = _deviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) noexcept override
{
_calibrationCache.clear();
std::ifstream input(calibrationTableName(), std::ios::binary);
input >> std::noskipws;
if (_readCache && input.good())
{
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(_calibrationCache));
}
length = _calibrationCache.size();
return length ? &_calibrationCache[0] : nullptr;
}
virtual void writeCalibrationCache(const void* cache, size_t length) noexcept override
{
std::ofstream output(calibrationTableName(), std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
std::string calibrationTableName()
{
return _calibrationCacheName;
}
BatchStream _stream;
size_t _inputCount;
bool _readCache{true};
std::string _calibrationCacheName;
void* _deviceInput{nullptr};
std::vector<char> _calibrationCache;
};
#ifndef __AIDGE_TENSORRT_UTILS_HPP__
#define __AIDGE_TENSORRT_UTILS_HPP__
#include <iostream>
#include <sstream>
#include <iomanip>
#include <string>
#include <vector>
#include <algorithm>
#include <NvInfer.h>
#include <cuda_runtime.h>
#define DIV_UP(X, Y) ((X) / (Y) + ((X) % (Y) > 0))
#define CEIL_DIV(X, Y) (((X) + (Y)-1) / (Y))
static struct Profiler : public nvinfer1::IProfiler
{
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) noexcept
{
auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
} gProfiler;
static class Logger : public nvinfer1::ILogger
{
void log(Severity severity, const char* msg) noexcept override
{
switch (severity)
{
case Severity::kINTERNAL_ERROR:
std::cerr << "INTERNAL_ERROR: ";
break;
case Severity::kERROR:
std::cerr << "ERROR: ";
break;
case Severity::kWARNING:
std::cerr << "WARNING: ";
break;
case Severity::kINFO:
std::cerr << "INFO: ";
break;
default:
std::cerr << "VERBOSE: ";
break;
}
std::cerr << msg << std::endl;
}
} gLogger;
static bool endsWith(std::string const &str, std::string const &suffix)
{
if (str.length() < suffix.length()) {
return false;
}
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
static std::string removeSubstring(const std::string& input, const std::string& substringToRemove) {
std::string result = input;
size_t pos = result.find(substringToRemove);
if (pos != std::string::npos) {
result.erase(pos, substringToRemove.length());
}
return result;
}
static std::string baseName(const std::string& filePath)
{
const size_t slashPos = filePath.find_last_of("/\\");
return (slashPos == std::string::npos) ? filePath
: filePath.substr(slashPos + 1);
}
static size_t dataTypeToSize(nvinfer1::DataType dataType)
{
switch ((int)dataType) {
case int(nvinfer1::DataType::kFLOAT):
return 4;
case int(nvinfer1::DataType::kHALF):
return 2;
case int(nvinfer1::DataType::kINT8):
return 1;
case int(nvinfer1::DataType::kINT32):
return 4;
case int(nvinfer1::DataType::kBOOL):
return 1;
default:
return 4;
}
}
static bool cudaSupportsDataType(nvinfer1::DataType dataType)
{
int deviceId;
cudaError_t status = cudaGetDevice(&deviceId);
if (status != cudaSuccess) {
std::cerr << "Failed to get CUDA device: " << cudaGetErrorString(status) << std::endl;
return false;
}
cudaDeviceProp deviceProp;
status = cudaGetDeviceProperties(&deviceProp, deviceId);
if (status != cudaSuccess) {
std::cerr << "Failed to get device properties: " << cudaGetErrorString(status) << std::endl;
return false;
}
int major = deviceProp.major;
int minor = deviceProp.minor;
float computeCapability = major + minor * 0.1f;
switch (dataType) {
case nvinfer1::DataType::kFLOAT:
// FP32 supported on all SM 7.5+
return computeCapability >= 7.5f;
case nvinfer1::DataType::kHALF:
// FP16 supported on all SM 7.5+
return computeCapability >= 7.5f;
case nvinfer1::DataType::kINT8:
// INT8 supported on all SM 7.5+
return computeCapability >= 7.5f;
case nvinfer1::DataType::kINT32:
// INT32 supported on all SM 7.5+
return computeCapability >= 7.5f;
case nvinfer1::DataType::kBOOL:
// BOOL supported on all SM 7.5+
return computeCapability >= 7.5f;
default:
std::cerr << "Unknown data type in cudaSupportsDataType" << std::endl;
return false;
}
}
static bool cudaHasFastFp16()
{
return cudaSupportsDataType(nvinfer1::DataType::kHALF);
}
static bool cudaHasFastInt8()
{
return cudaSupportsDataType(nvinfer1::DataType::kINT8);
}
#endif // __AIDGE_TENSORRT_UTILS_HPP__
#ifndef __AIDGE_TENSORRT_CUDA_UTILS_H__
#define __AIDGE_TENSORRT_CUDA_UTILS_H__
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <cublas_v2.h>
#include <cuda.h>
#include <cudnn.h>
#define FatalError(s) \
{ \
std::stringstream _where, _message; \
_where << __FILE__ << ':' << __LINE__; \
_message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \
std::cerr << _message.str() << "\nAborting...\n"; \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
}
#define CHECK_CUDA_STATUS(status) \
{ \
std::stringstream _error; \
if (status != 0) { \
_error << "Cuda failure: " << cudaGetErrorString(status); \
FatalError(_error.str()); \
} \
}
#endif // __AIDGE_TENSORRT_CUDA_UTILS_H__
#include "Graph.hpp"
#include <iostream>
#include <vector>
// Uncomment if you want to run a single inference with static inputs
// #include "inputs.h"
int main()
{
Graph model;
model.load("model.onnx");
// Uncomment if the model does not have explicit batch
// Don't forget to change the values of the input dimensions
// std::vector<std::vector<int>> dims_input{{ 1, 1, 28, 28 }};
// model.auto_input_profile(dims_input);
// Uncomment if you want to activate FP16
// model.datamode(nvinfer1::DataType::kHALF);
model.initialize();
// Comment to remove model profiling
model.profile(10);
// Example of script to run a single inference with static inputs
/*
const unsigned int nb_classes = 10;
std::vector<void *> bufferIn {1, nullptr};
bufferIn[0] = (void *)new char[28*28*1 * 4];
float *pData = (float *)bufferIn[0];
for (unsigned int j = 0; j < 784; ++j) {
pData[j] = inputs[j];
}
std::vector<void *> bufferOut {1, nullptr};
bufferOut[0] = (void *)new char[10 * 4];
model.run_async(bufferIn.data(), bufferOut.data());
float *floatArray = static_cast<float *>(bufferOut[0]);
for (unsigned int i = 0; i < nb_classes; ++i)
{
std::cout << i << ": " << floatArray[i] << std::endl;
}
delete[] (char *)bufferIn[0];
delete[] (char *)bufferOut[0];
*/
return 0;
}
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include "Graph.hpp"
#include <vector>
namespace py = pybind11;
void init_Graph(py::module& m)
{
py::enum_<ExecutionMode_T>(m, "exe_mode")
.value("sync", ExecutionMode_T::SYNC)
.value("async", ExecutionMode_T::ASYNC)
.export_values()
;
py::class_<Graph>(m, "Graph")
.def(py::init<std::string, unsigned int, int>(),
py::arg("filepath") = "",
py::arg("device_id") = 0,
py::arg("nb_bits") = -32,
R"mydelimiter(
Construct a new Graph object.
:param filepath: Path to the file to load (default is empty).
:type filepath: str
:param device_id: Device ID to use (default is 0).
:type device_id: unsigned int
:param nb_bits: Number of bits for data (default is -32).
:type nb_bits: int
)mydelimiter")
.def("device", &Graph::device, py::arg("id"),
R"mydelimiter(
Set the CUDA device.
:param id: Device ID.
:type id: unsigned int
)mydelimiter")
.def("load", &Graph::load, py::arg("filepath"),
R"mydelimiter(
Load a graph from a file, either a `.onnx` file or a `.trt` engine.
:param filepath: Path to the file.
:type filepath: str
)mydelimiter")
.def("save", &Graph::save, py::arg("filepath"),
R"mydelimiter(
Save the current graph as a `.trt` engine.
:param filepath: Path to the file.
:type filepath: str
)mydelimiter")
.def("calibrate", &Graph::calibrate, py::arg("calibration_folder_path") = "./calibration_folder/", py::arg("cache_file_path") = "./calibration_cache", py::arg("batch_size") = 1,
R"mydelimiter(
Calibrate the graph to determine the appropriate scaling factors for converting floating-point values to lower-precision representations, using the calibration data found inside the specified `calibration_folder`. This folder should include a `.info` file containing the dimensions of the calibration data, along with the data stored in `.batch` files.
Calibration can be expensive, so it is beneficial to generate the calibration data once and then reuse it for subsequent builds of the network. The cache includes the regression cutoff and quantile values used to generate it, and will not be used if these do not match the settings of the current calibrator. However, the network should be recalibrated if its structure changes or if the input data set changes, and it is the responsibility of the application to ensure this.
:param calibration_folder_path: Path to the calibration folder.
:type calibration_folder_path: str
:param cache_file_path: Path to the cache file.
:type cache_file_path: str
:param batch_size: Batch size for calibration (default is 1).
:type batch_size: int
)mydelimiter")
.def("initialize", &Graph::initialize,
R"mydelimiter(
Initializes the TensorRT engine and execution context for the Graph class. This involves building a serialized network, deserializing it into a CUDA engine, and setting up the necessary execution context and I/O descriptors.
)mydelimiter")
.def("profile", &Graph::profile, py::arg("nb_iterations"), py::arg("mode")= ExecutionMode_T::ASYNC,
R"mydelimiter(
Profile the graph's execution by printing the average profiled TensorRT process time per stimulus.
:param nb_iterations: Number of iterations for profiling.
:type nb_iterations: unsigned int
:param mode: Execution mode (SYNC or ASYNC, default is ASYNC).
:type mode: ExecutionMode_T
)mydelimiter")
.def("run_sync", [](Graph& graph, py::list inputs) -> py::list {
py::list outputs;
std::vector<void *> bufferIn;
std::vector<void *> bufferOut;
IOGraphDesc iodesc = graph.getIODescription();
// Fill bufferIn for inference
for (py::handle array: inputs)
{
// py::buffer_info buf_info =
// array.cast<py::array_t<float>>().request();
py::buffer_info buf_info = array.cast<py::array>().request();
bufferIn.push_back(static_cast<void*>(buf_info.ptr));
}
// Allocate memory resources for bufferOut
for (unsigned int i = 0; i < iodesc.outputs.size(); ++i)
{
void* out = (void *)new char[iodesc.outputs[i].size];
bufferOut.push_back(out);
}
// Run inference
graph.run_sync(bufferIn.data(), bufferOut.data());
// Get outputs
for (unsigned int i = 0; i < iodesc.outputs.size(); ++i)
{
// TODO: improve this code so it is independent of the output data type
float* data_ptr = static_cast<float*>(bufferOut[i]);
py::array_t<float> processed_array = py::array_t<float>(
iodesc.outputs[i].nbElements, data_ptr);
outputs.append(processed_array);
// py::array_t copies the buffer when no base handle is given,
// so the host buffer allocated above can be released here
delete[] static_cast<char*>(bufferOut[i]);
}
return outputs;
}, py::arg("inputs"),
R"mydelimiter(
Run the graph synchronously on the given inputs.
:param inputs: Input data (one numpy array per graph input).
:type inputs: list
:return: Output data (one numpy array per graph output).
:rtype: list
)mydelimiter");
}
PYBIND11_MODULE(aidge_trt, m)
{
init_Graph(m);
}
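Putting the bindings above together, here is a hedged end-to-end sketch of an INT8 build driven from Python; the module import path mirrors the one used in test.py below, while the model name and calibration paths are placeholders.

```python
# Hedged sketch of the Python API defined above; file names are placeholders.
import build.lib.aidge_trt as aidge_trt

# nb_bits=8 selects INT8 (see Graph::databits() in Graph.cpp below).
model = aidge_trt.Graph("model.onnx", device_id=0, nb_bits=8)

# The folder path is used directly as a file prefix, hence the trailing slash.
model.calibrate("./calibration_folder/", "./calibration_cache", batch_size=1)

model.initialize()        # build the engine with the INT8 calibrator attached
model.save("model_int8")  # serializes the engine to model_int8.trt
model.profile(10)         # average TensorRT process time per stimulus
```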
#include "Graph.hpp"
#include <fstream>
#include <sstream>
#include "BatchStream.hpp"
#include "IInt8EntropyCalibrator.hpp"
#include <dirent.h>
Graph::Graph( std::string const& filePath = "",
unsigned int device_id = 0,
int nbbits = -32)
{
// ctor
this->_builder = nvinfer1::createInferBuilder(gLogger);
this->_profile = this->_builder->createOptimizationProfile();
this->_builderconfig = this->_builder->createBuilderConfig();
// this->_builderconfig->setMaxWorkspaceSize(MAX_WORKSPACE_SIZE);
this->_builderconfig->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, MAX_WORKSPACE_SIZE);
CHECK_CUDA_STATUS(cudaStreamCreate(&(this->_stream)));
device(device_id);
databits(nbbits);
if (!filePath.empty()) {
load(filePath);
}
}
Graph::~Graph()
{
// dtor
if (!this->_iobuffer.empty()) {
for (unsigned int i = 0; i < this->_iobuffer.size(); ++i) {
CHECK_CUDA_STATUS(cudaFree(this->_iobuffer[i]));
}
this->_iobuffer.clear();
}
CHECK_CUDA_STATUS(cudaStreamDestroy(this->_stream));
}
void Graph::device(unsigned int id)
{
CHECK_CUDA_STATUS(cudaSetDevice(id));
}
void Graph::databits(int nbbits)
{
nvinfer1::DataType datatype;
if (nbbits == -32) {
datatype = nvinfer1::DataType::kFLOAT;
}
else if (nbbits == -16) {
datatype = nvinfer1::DataType::kHALF;
}
else if (nbbits == -8) {
datatype = nvinfer1::DataType::kFP8;
}
else if (nbbits == 32) {
datatype = nvinfer1::DataType::kINT32;
}
else if (nbbits == 8) {
datatype = nvinfer1::DataType::kINT8;
}
else {
std::cout << "Cannot use this number of bits ( "
<< nbbits
<< ") for infering the network"
<< std::endl;
return;
}
datamode(datatype);
}
void Graph::datamode(nvinfer1::DataType datatype)
{
switch (datatype) {
case nvinfer1::DataType::kFLOAT:
// Do nothing as it is the default datatype
break;
case nvinfer1::DataType::kHALF: {
if (!cudaHasFastFp16()) {
std::cout << "Cannot use FP16 for this platform \nLet default datatype activated." << std::endl;
return;
}
this->_builderconfig->setFlag(nvinfer1::BuilderFlag::kFP16);
}
break;
case nvinfer1::DataType::kINT8: {
if (!cudaHasFastInt8()) {
std::cout << "Cannot use INT8 for this platform \nLet default datatype activated." << std::endl;
return;
}
// Mark calibrator as nullptr not to provide an INT8 calibrator
this->_builderconfig->setFlag(nvinfer1::BuilderFlag::kINT8);
}
break;
case nvinfer1::DataType::kFP8:
case nvinfer1::DataType::kINT32:
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kUINT8:
default:
std::cout << "Cannot use this datatype for infering the network \nLet default datatype activated." << std::endl;
break;
}
}
void Graph::calibrate( std::string const& calibration_folder_path = "./calibration_folder/",
std::string const& cache_file_path = "./calibration_cache",
unsigned int batch_size = 1)
{
// Open calibration files
const std::string calibDir = calibration_folder_path;
std::vector<std::string> filesCalib;
struct dirent* pFile;
DIR* pDir = opendir(calibDir.c_str());
if (pDir == NULL) {
std::cout << "No directory for batches calibration" << std::endl;
}
else {
while ((pFile = readdir(pDir)) != NULL)
{
if (pFile->d_name[0] != '.') filesCalib.push_back(std::string(calibDir + pFile->d_name));
}
closedir(pDir);
}
unsigned int nbCalibFiles = filesCalib.size();
if(nbCalibFiles == 0) std::cout << "Cannot find calibration files in dir " << calibDir << std::endl;
// Get the input tensor shape by reading the .info file in the calibration folder
std::vector<unsigned int> dims;
std::ifstream inputFile(calibDir + "/.info");
if (!inputFile.is_open()) {
std::cout << "Error opening the file .info" << std::endl;
} else {
std::string line;
// Read all lines from the file
while (std::getline(inputFile, line)) {
try {
unsigned int intValue = std::stoul(line); // Use stoul for unsigned int
dims.push_back(intValue);
} catch (const std::invalid_argument& e) {
std::cerr << "Error converting string to unsigned int: " << e.what() << std::endl;
} catch (const std::out_of_range& e) {
std::cerr << "Error: Value out of range for unsigned int conversion." << std::endl;
}
}
}
inputFile.close();
BatchStream calibrationStream(batch_size, dims, nbCalibFiles/batch_size, calibration_folder_path);
this->_calibrator = new Int8EntropyCalibrator(calibrationStream, 0, cache_file_path);
this->_builderconfig->setInt8Calibrator(this->_calibrator);
}
void Graph::load(std::string const& filePath)
{
if (endsWith(filePath, ".onnx")) {
load_onnx(filePath);
}
else if (endsWith(filePath, ".trt")) {
load_trt(filePath);
}
else {
throw std::runtime_error("Cannot load this format of file");
}
}
void Graph::load_onnx(std::string const& onnxModelPath)
{
// Impose TensorRT flags for the creation of the network
// Maybe change it to adapt graph to dynamic inputs
nvinfer1::NetworkDefinitionCreationFlags creationFlag;
creationFlag = 1 << static_cast<int32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
this->_network = this->_builder->createNetworkV2(creationFlag);
nvonnxparser::IParser* parser = nvonnxparser::createParser(*this->_network, gLogger);
parser->parseFromFile(onnxModelPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO));
this->_network->setName(removeSubstring(baseName(onnxModelPath), ".onnx").c_str());
}
void Graph::load_trt(std::string const& trtModelPath)
{
std::ifstream cudaEngineStream(trtModelPath);
if(!cudaEngineStream.good())
throw std::runtime_error("Could not open cuda engine file " + trtModelPath);
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
// Read the stringstream into a memory buffer and pass that to TRT
cudaEngineStream.seekg(0, std::ios::end);
const int modelSize = cudaEngineStream.tellg();
cudaEngineStream.seekg(0, std::ios::beg);
void* modelMem = malloc(modelSize);
if(!modelMem)
throw std::runtime_error("Could not allocate enough memory for load cuda engine file " + trtModelPath);
cudaEngineStream.read((char*)modelMem, modelSize);
this->_engine = runtime->deserializeCudaEngine(modelMem, modelSize);
free(modelMem);
}
void Graph::save(std::string const& fileName)
{
std::ofstream engineSerializedFile;
nvinfer1::IHostMemory* memory = this->_engine->serialize();
if (memory == nullptr)
throw std::runtime_error("Serialize engine failed");
// Open a new file
engineSerializedFile.open(fileName + ".trt", std::ios::out | std::ios::binary);
if (engineSerializedFile.is_open() && engineSerializedFile.good() && !engineSerializedFile.fail()) {
//Save the serialized engine data into the file
engineSerializedFile.write(reinterpret_cast<const char *>(memory->data()), memory->size());
engineSerializedFile.close();
}
else
throw std::runtime_error("Could not save cuda engine file in " + fileName + ".trt");
}
void Graph::initialize()
{
if (!this->_engine) {
nvinfer1::IHostMemory* engineString = this->_builder->buildSerializedNetwork(*(this->_network), *(this->_builderconfig));
if (engineString == nullptr || engineString->size() == 0)
throw std::runtime_error("Failed building serialized engine");
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
this->_engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size());
}
this->_context = this->_engine->createExecutionContext();
// Initialize IO information
initialize_io_descriptors();
}
void Graph::auto_input_profile(std::vector<std::vector<int>> dims_inputs)
{
// TODO: improve by reading the different optimization profiles and dimensions from a file (e.g. JSON)
for (int i = 0; i < this->_network->getNbInputs(); ++i) {
nvinfer1::ITensor* input = this->_network->getInput(i);
nvinfer1::Dims dims{};
dims.nbDims = dims_inputs[i].size();
for (unsigned int k = 0; k < dims_inputs[i].size(); ++k) {
dims.d[k] = dims_inputs[i][k];
}
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, dims);
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, dims);
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, dims);
}
this->_builderconfig->addOptimizationProfile(this->_profile);
}
void Graph::initialize_io_descriptors()
{
this->_iodescriptors.nIO = this->_engine->getNbIOTensors();
for (int nIO = 0; nIO < this->_engine->getNbIOTensors(); ++nIO) {
std::string name = std::string(this->_engine->getIOTensorName(nIO));
nvinfer1::Dims dim = this->_context->getTensorShape(name.c_str());
int size = 1;
for (int j = 0; j < dim.nbDims; ++j) {
size *= dim.d[j];
}
int datasize = size * dataTypeToSize(this->_engine->getTensorDataType(name.c_str()));
IODesc descriptor {name, size, datasize};
switch (this->_engine->getTensorIOMode(name.c_str())) {
case nvinfer1::TensorIOMode::kINPUT:
this->_iodescriptors.inputs.push_back(descriptor);
break;
case nvinfer1::TensorIOMode::kOUTPUT:
this->_iodescriptors.outputs.push_back(descriptor);
break;
case nvinfer1::TensorIOMode::kNONE:
default:
break;
}
}
}
void Graph::run(void** inputs, void** outputs, ExecutionMode_T mode)
{
switch (mode) {
case SYNC: {
run_sync(inputs, outputs);
break;
}
case ASYNC: {
run_async(inputs, outputs);
break;
}
default:
throw std::runtime_error("Running mode not supported");
}
}
void Graph::run_async(void** inputs, void** outputs)
{
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Check if memory resources have been allocated for inputs and outputs
// If not, allocate memory on device
if (this->_iobuffer.empty()) {
for (unsigned int i = 0; i < nbInputs; ++i) {
void* inputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&inputPtr, this->_iodescriptors.inputs[i].size));
this->_context->setTensorAddress(this->_iodescriptors.inputs[i].name.c_str(), inputPtr);
this->_iobuffer.push_back(inputPtr);
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
void* outputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&outputPtr, this->_iodescriptors.outputs[i].size));
this->_context->setTensorAddress(this->_iodescriptors.outputs[i].name.c_str(), outputPtr);
this->_iobuffer.push_back(outputPtr);
}
}
// Copy inputs to GPU
for (unsigned int i = 0; i < nbInputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(this->_iobuffer[i],
inputs[i],
this->_iodescriptors.inputs[i].size,
cudaMemcpyHostToDevice));
}
// Run inference on GPU
this->_context->enqueueV3(this->_stream);
// Copy outputs to CPU
for (unsigned int i = 0; i < nbOutputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(outputs[i],
this->_iobuffer[i + nbInputs],
this->_iodescriptors.outputs[i].size,
cudaMemcpyDeviceToHost));
}
}
void Graph::run_sync(void** inputs, void** outputs)
{
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Check if memory resources have been allocated for inputs and outputs
// If not, allocate memory on device
if (this->_iobuffer.empty()) {
for (unsigned int i = 0; i < nbInputs; ++i) {
void* inputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&inputPtr, this->_iodescriptors.inputs[i].size));
this->_iobuffer.push_back(inputPtr);
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
void* outputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&outputPtr, this->_iodescriptors.outputs[i].size));
this->_iobuffer.push_back(outputPtr);
}
}
// Copy inputs to GPU
for (unsigned int i = 0; i < nbInputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(this->_iobuffer[i],
inputs[i],
this->_iodescriptors.inputs[i].size,
cudaMemcpyHostToDevice));
}
// Run inference on GPU
this->_context->executeV2(this->_iobuffer.data());
// Copy outputs to CPU
for (unsigned int i = 0; i < nbOutputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(outputs[i],
this->_iobuffer[i + nbInputs],
this->_iodescriptors.outputs[i].size,
cudaMemcpyDeviceToHost));
}
}
void Graph::profile(unsigned int nb_iterations, ExecutionMode_T mode)
{
if(!this->_context) {
throw std::runtime_error(
"Cannot profile the graph without context from engine");
}
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Initialize input buffer on CPU
std::vector<void *> inputs {nbInputs, nullptr};
for (unsigned int i = 0; i < nbInputs; ++i) {
inputs[i] = (void *)new char[this->_iodescriptors.inputs[i].size];
unsigned int nbElts = this->_iodescriptors.inputs[i].size / dataTypeToSize(this->_engine->getTensorDataType(this->_iodescriptors.inputs[i].name.c_str()));
float *pData = (float *)inputs[i];
for (unsigned int j = 0; j < nbElts; ++j) {
pData[j] = float(j);
}
}
// Initialize output buffer on CPU
std::vector<void *> outputs {nbOutputs, nullptr};
for (unsigned int i = 0; i < nbOutputs; ++i) {
outputs[i] = (void *)new char[this->_iodescriptors.outputs[i].size];
}
// Run 1st inference to allocate GPU resources
run(inputs.data(), outputs.data(), mode);
this->_context->setProfiler(&gProfiler);
for (unsigned int i = 0; i < nb_iterations; ++i) {
run(inputs.data(), outputs.data(), mode);
}
double totalProcessTime = 0.0;
for (size_t i = 0; i < gProfiler.mProfile.size(); ++i)
totalProcessTime += gProfiler.mProfile[i].second / nb_iterations;
for (size_t i = 0; i < gProfiler.mProfile.size(); i++)
{
const double processTimeMs = gProfiler.mProfile[i].second / nb_iterations;
const double workLoad = (processTimeMs / totalProcessTime) * 100.0;
std::string barrelLoad(((unsigned int)workLoad + 1) * 2, '*');
std::cout << std::setprecision(10)
<< "(" << std::setfill('0') << std::setw(2)
<< (unsigned int)workLoad << "%) " << barrelLoad
<< " " << gProfiler.mProfile[i].first << ": "
<< processTimeMs << " ms"
<< std::endl;
}
std::cout << "Average profiled tensorRT process time per stimulus = "
<< totalProcessTime << " ms" << std::endl;
for (unsigned int i = 0; i < nbInputs; ++i) {
delete[] (char *)inputs[i];
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
delete[] (char *)outputs[i];
}
}
unsigned int Graph::getNbIO()
{
return this->_iodescriptors.nIO;
}
IOGraphDesc Graph::getIODescription()
{
return this->_iodescriptors;
}
"""Example test file for the TensorRT Python API.
"""
# TODO Update the path to the shared object if needed
import build.lib.aidge_trt as aidge_trt
import numpy as np
if __name__ == '__main__':
    model = aidge_trt.Graph("model.onnx")
    model.initialize()
    # Profile with 10 iterations
    model.profile(10)
    # Execution example
    # img: numpy.array = np.load("PATH TO NPY file")
    # output: numpy.array = model.run_sync([img])
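For completeness, a hedged variant of the commented example above that runs a single synchronous inference on random data; the input shape is an assumption about the exported model, not something specified by this repository.

```python
# Hedged sketch only: the (1, 1, 28, 28) input shape is a placeholder.
import build.lib.aidge_trt as aidge_trt
import numpy as np

model = aidge_trt.Graph("model.onnx")
model.initialize()

img = np.random.rand(1, 1, 28, 28).astype(np.float32)  # must match the model input
outputs = model.run_sync([img])  # returns one flat numpy array per graph output
for i, out in enumerate(outputs):
    print(f"output {i}: {out.shape[0]} values, first = {out[:5]}")
```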
#!/usr/bin/env bash
# This script is not supposed to run alone
# Must be used with the Makefile of this export
OBJDIR=build
BINDIR=bin
cmake -B$OBJDIR $@
cmake --build $OBJDIR
# Add write permissions for users
# to clean build and bin folders outside the container
if [ -d "$OBJDIR" ]; then
chmod -R a+w $OBJDIR
fi
if [ -d "$BINDIR" ]; then
chmod -R a+w $BINDIR
fi
#!/usr/bin/env bash
set -Eeuo pipefail
# first arg is `-f` or `--some-option` or there are no args
if [ "$#" -eq 0 ] || [ "${1#-}" != "$1" ]; then
exec bash "$@"
fi
exec "$@"
FROM nvcr.io/nvidia/tensorrt:25.05-py3
# Start bash login shell
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["/bin/bash", "-i"]