Compare revisions

Commits on Source (5)
Showing 2747 additions and 2 deletions
###############################################################################
#                 Aidge Continuous Integration and Deployment                #
#                                                                             #
###############################################################################
stages:
  - static_analysis
  - build
  - test
  - coverage
  - release
  - deploy

include:
  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
    file:
      # Choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
      - '.gitlab/ci/release/pip.gitlab-ci.yml'
      # Since aidge_export_tensorrt is a pure Python package, building on Windows does not differ from building on Ubuntu.
      # - '.gitlab/ci/windows_python.gitlab-ci.yml'

test:ubuntu_python:
  before_script:
    - !reference [.setup:test:ubuntu_python, before_script]
    - DEPS_NAMES=("aidge_onnx" "aidge_core")
    - DEPENDENCY_JOB="build:ubuntu_python"
    - !reference [.ubuntu:download:artifacts, script]
    # Extra dependencies need to be installed for the tests:
    - python -m pip install jinja2

coverage:ubuntu_python:
  before_script:
    - !reference [.setup:coverage:ubuntu_python, before_script]
    - DEPS_NAMES=("aidge_onnx" "aidge_core")
    - DEPENDENCY_JOB="build:ubuntu_python"
    - !reference [.ubuntu:download:artifacts, script]
    # Extra dependencies need to be installed for the tests:
    - python -m pip install jinja2
@@ -33,7 +33,7 @@ def generate_file(filename, templatename, **kwargs):
-def export(export_folder, graphview, python_binding=True, trt_version="8.6"):
+def export(export_folder, graphview, python_binding=True, trt_version="10.10"):
     """Generate a TensorRT export.

     :param export_folder: Name of the folder where to generate the TensorRT export
@@ -42,7 +42,7 @@ def export(export_folder, graphview, python_binding=True, trt_version="8.6"):
     :type graphview: str or :py:class:`Aidge.GraphView`
     :param python_binding: If ``True``, clone PyBind into the export to enable python binding, defaults to True
     :type python_binding: bool, optional
-    :param trt_version: The supported TensorRT version, defaults to "8.6"
+    :param trt_version: The supported TensorRT version, defaults to "10.10"
     :type trt_version: str, optional
     """
     print(f"Generating TensorRT export in {export_folder}.")
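For context, a call to the updated helper might look like the following sketch; the import path, model file, and output folder are assumptions for the example, not taken from the diff:

# Illustrative only: generate a TensorRT 10.10 export from an ONNX model.
from aidge_export_tensorrt import export

export("export_trt",          # folder where the export is generated
       "model.onnx",          # model to export (a str path or an Aidge GraphView is accepted)
       python_binding=True,   # clone PyBind into the export to build the Python lib
       trt_version="10.10")   # new default TensorRT version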
cmake_minimum_required(VERSION 3.15)
# Options for compiling the export
option(PYBIND "python binding" ON)
option(TEST_DEBUG "c++ test for debugging" OFF)
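# A minimal sketch of toggling these options at configure time (an illustrative
# invocation; the Makefile drives the same cache flags through
# tools/compile_export_linux.sh):
#   cmake -S . -B build -DPYBIND=ON -DTEST_DEBUG=OFF
#   cmake --build build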
# Define CMAKE constants
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE_INIT Release)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
# Add cmake modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
project(Aidge_Export_TRT)
enable_language(CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fPIC")
# Suppress override warnings raised by deprecated functions in the plugin modules
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress 997")
add_library(aidge_trt_cpp STATIC)
# CUDAToolkit
if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
find_package(CUDAToolkit)
else()
# For CMake >= 3.17.0, use the default FindCUDAToolkit module provided by CMake
# => in this case, we need to prevent find_package() from using our own copy.
list(REMOVE_ITEM CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
find_package(CUDAToolkit)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/")
endif()
find_package(CuDNN)
find_package(TensorRT)
# Add include directory
target_include_directories(aidge_trt_cpp PUBLIC "include")
# Add plugin directory
target_include_directories(aidge_trt_cpp PUBLIC "plugins")
# Add cuda, cudnn and tensorrt include directories
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${CUDNN_INCLUDE_DIRS})
target_include_directories(aidge_trt_cpp SYSTEM PUBLIC ${TensorRT_INCLUDE_DIRS})
# Add cpp src files
file(GLOB_RECURSE cpp_src_files "src/*.cpp" "plugins/*.cpp")
target_sources(aidge_trt_cpp PUBLIC ${cpp_src_files})
# Add cuda src files
# Use PUBLIC for the target sources so that TensorRT can detect the plugins
file(GLOB_RECURSE cuda_src_files "src/*.cu" "plugins/*.cu")
target_sources(aidge_trt_cpp PUBLIC ${cuda_src_files})
# Add libraries relative to CUDA
target_link_libraries(aidge_trt_cpp PUBLIC CUDA::cudart CUDA::cublas)
# Add libraries relative to CuDNN
target_link_libraries(aidge_trt_cpp PUBLIC ${CUDNN_LIBRARY})
# Add libraries relative to TensorRT
target_link_libraries(aidge_trt_cpp PUBLIC trt::nvinfer trt::nvonnxparser)
if (PYBIND)
if(NOT EXISTS ${CMAKE_SOURCE_DIR}/python_binding/pybind11)
message(STATUS "Folder python_binding/pybind11 does not exist. Cloning from Git repository.")
# Run the Git clone command
execute_process(
COMMAND git clone --depth=1 https://github.com/pybind/pybind11.git ${CMAKE_SOURCE_DIR}/python_binding/pybind11
RESULT_VARIABLE git_clone_result
)
# Check the result of the Git clone operation
if(git_clone_result)
message(FATAL_ERROR "Failed to clone https://github.com/pybind/pybind11.git.\nError code: ${git_clone_result}")
else()
message(STATUS "Pybind11 cloned successfully.")
endif()
execute_process(
COMMAND chmod -R a+w ${CMAKE_SOURCE_DIR}/python_binding/pybind11
)
endif()
message(STATUS "Using python_binding/pybind11 for Python binding")
add_subdirectory(${CMAKE_SOURCE_DIR}/python_binding/pybind11 ${CMAKE_BINARY_DIR}/pybind11)
pybind11_add_module(aidge_trt MODULE "python_binding/pybind_export.cpp")
target_include_directories(aidge_trt PUBLIC ${pybind11_INCLUDE_DIRS} "python_binding")
target_link_libraries(aidge_trt PUBLIC aidge_trt_cpp)
endif()
if (TEST_DEBUG)
add_executable(run_export "test_debug.cpp")
target_link_libraries(run_export PUBLIC aidge_trt_cpp)
endif()
#####################################################################
##: Different options to compile the export
##: Usage :
##:
##:   make / make help
##:       display the different options available
##:   make build_cpp
##:       compile the export on host for C++ apps
##:       (generate an executable in build/bin)
##:   make build_lib_python
##:       compile the export on host for Python apps
##:       (generate a python lib in build/lib)
##:   make build_image_docker
##:       generate the docker image of the tensorrt compiler
##:   make build_cpp_docker
##:       compile the export in a container for C++ apps
##:       (generate an executable in build/bin)
##:   make test_cpp_docker
##:       test the executable for C++ apps in a container
##:   make build_lib_python_docker
##:       compile the export in a container for Python apps
##:       (generate a python lib in build/lib)
##:   make test_lib_python_docker
##:       test the lib for Python apps in a container
##:   make clean
##:       clean up the build and bin folders
##:
#####################################################################
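# Illustrative Docker workflow (an example only; assumes Docker and the NVIDIA
# Container Toolkit are available so the test targets can pass --gpus=all):
#   make build_image_docker    # build the tensorrt:10.10_compiler image once
#   make build_cpp_docker      # compile the C++ export inside that container
#   make test_cpp_docker       # run the generated executable on the GPU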
OBJDIR := build
BINDIR := bin
TARGET := ${BINDIR}/run_export
MAKEFLAGS := --no-print-directory
DOCKER_COMPILER := tools/tensorrt10.10_compiler.Dockerfile
IMAGE := tensorrt:10.10_compiler
all: help
.PHONY: build_cpp build_lib_python clean help
# Build for C++ app
build_cpp:
	./tools/compile_export_linux.sh -DPYBIND=0 -DTEST_DEBUG=1

# Build for Python app
build_lib_python:
	./tools/compile_export_linux.sh -DPYBIND=1 -DTEST_DEBUG=0

clean:
	if [ -d "$(OBJDIR)" ]; then rm -rf $(OBJDIR); fi
	if [ -d "$(BINDIR)" ]; then rm -rf $(BINDIR); fi

help:
	@grep -e "^##:" Makefile;

# Makefile target for building the tensorrt compiler image
.PHONY: build_image_docker
build_image_docker:
	@docker build --pull --rm -f "${DOCKER_COMPILER}" -t ${IMAGE} tools/

# Makefile targets for building and testing c++ app via docker
.PHONY: build_cpp_docker test_cpp_docker
build_cpp_docker:
	@docker run --rm --name compiling -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} make build_cpp
test_cpp_docker:
	@docker run --rm --gpus=all --name testing -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} ./${OBJDIR}/${TARGET}

# Makefile targets for building and testing python app via docker
.PHONY: build_lib_python_docker test_lib_python_docker
build_lib_python_docker:
	@docker run --rm --name compiling -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} make build_lib_python
test_lib_python_docker:
	@docker run --rm --gpus=all --name testing -v "${PWD}":/usr/src/export -w /usr/src/export ${IMAGE} python3 test.py
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
FindCUDAToolkit
---------------
.. versionadded:: 3.17
This script locates the NVIDIA CUDA toolkit and the associated libraries, but
does not require the ``CUDA`` language be enabled for a given project. This
module does not search for the NVIDIA CUDA Samples.
.. versionadded:: 3.19
QNX support.
Search Behavior
^^^^^^^^^^^^^^^
The CUDA Toolkit search behavior uses the following order:
1. If the ``CUDA`` language has been enabled we will use the directory
containing the compiler as the first search location for ``nvcc``.
2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g.,
``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it
will be searched. If both an environment variable **and** a
configuration variable are specified, the *configuration* variable takes
precedence.
The directory specified here must be such that the executable ``nvcc`` or
the appropriate ``version.txt`` file can be found underneath the specified
directory.
3. If the CUDA_PATH environment variable is defined, it will be searched
for ``nvcc``.
4. The user's path is searched for ``nvcc`` using :command:`find_program`. If
this is found, no subsequent search attempts are performed. Users are
responsible for ensuring that the first ``nvcc`` to show up in the path is
the desired path in the event that multiple CUDA Toolkits are installed.
5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
used. No subsequent search attempts are performed. No default symbolic link
location exists for the Windows platform.
6. The platform specific default install locations are searched. If exactly one
candidate is found, this is used. The default CUDA Toolkit install locations
searched are:
+-------------+-------------------------------------------------------------+
| Platform | Search Pattern |
+=============+=============================================================+
| macOS | ``/Developer/NVIDIA/CUDA-X.Y`` |
+-------------+-------------------------------------------------------------+
| Other Unix | ``/usr/local/cuda-X.Y`` |
+-------------+-------------------------------------------------------------+
| Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` |
+-------------+-------------------------------------------------------------+
Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as
``/usr/local/cuda-9.0`` or
``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0``
.. note::
When multiple CUDA Toolkits are installed in the default location of a
system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0``
exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this
package is marked as **not** found.
There are too many factors involved in making an automatic decision in
the presence of multiple CUDA Toolkits being installed. In this
situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or
(2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for
:command:`find_program` to find.
Arguments
^^^^^^^^^
``[<version>]``
The ``[<version>]`` argument requests a version with which the package found
should be compatible. See :ref:`find_package version format <FIND_PACKAGE_VERSION_FORMAT>`
for more details.
Options
^^^^^^^
``REQUIRED``
If specified, configuration will error if a suitable CUDA Toolkit is not
found.
``QUIET``
If specified, the search for a suitable CUDA Toolkit will not produce any
messages.
``EXACT``
If specified, the CUDA Toolkit is considered found only if the exact
``VERSION`` specified is recovered.
Imported targets
^^^^^^^^^^^^^^^^
An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided.
This module defines :prop_tgt:`IMPORTED` targets for each
of the following libraries that are part of the CUDAToolkit:
- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
- :ref:`cuFile<cuda_toolkit_cuFile>`
- :ref:`cuFFT<cuda_toolkit_cuFFT>`
- :ref:`cuRAND<cuda_toolkit_cuRAND>`
- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
- :ref:`cuPTI<cuda_toolkit_cupti>`
- :ref:`NPP<cuda_toolkit_NPP>`
- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
- :ref:`nvidia-ML<cuda_toolkit_nvML>`
- :ref:`nvPTX Compiler<cuda_toolkit_nvptx>`
- :ref:`nvRTC<cuda_toolkit_nvRTC>`
- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
- :ref:`nvtx3<cuda_toolkit_nvtx3>`
- :ref:`OpenCL<cuda_toolkit_opencl>`
- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
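
As a minimal sketch, a consumer project (``my_app`` is a placeholder target
name) only needs to find the package and link the relevant targets:

.. code-block:: cmake

  find_package(CUDAToolkit REQUIRED)
  target_link_libraries(my_app PRIVATE CUDA::cudart CUDA::cublas)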
.. _`cuda_toolkit_rt_lib`:
CUDA Runtime Library
""""""""""""""""""""
The CUDA Runtime library (cudart) is what most applications will typically
need to link against to make calls such as `cudaMalloc` and `cudaFree`.
Targets Created:
- ``CUDA::cudart``
- ``CUDA::cudart_static``
.. _`cuda_toolkit_driver_lib`:
CUDA Driver Library
""""""""""""""""""""
The CUDA Driver library (cuda) is used by applications that use calls
such as `cuMemAlloc` and `cuMemFree`.
Targets Created:
- ``CUDA::cuda_driver``
.. _`cuda_toolkit_cuBLAS`:
cuBLAS
""""""
The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
Targets Created:
- ``CUDA::cublas``
- ``CUDA::cublas_static``
- ``CUDA::cublasLt`` starting in CUDA 10.1
- ``CUDA::cublasLt_static`` starting in CUDA 10.1
.. _`cuda_toolkit_cuFile`:
cuFile
""""""
.. versionadded:: 3.25
The NVIDIA GPUDirect Storage `cuFile <https://docs.nvidia.com/cuda/cufile-api/index.html>`_ library.
Targets Created:
- ``CUDA::cuFile`` starting in CUDA 11.4
- ``CUDA::cuFile_static`` starting in CUDA 11.4
- ``CUDA::cuFile_rdma`` starting in CUDA 11.4
- ``CUDA::cuFile_rdma_static`` starting in CUDA 11.4
.. _`cuda_toolkit_cuFFT`:
cuFFT
"""""
The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
Targets Created:
- ``CUDA::cufft``
- ``CUDA::cufftw``
- ``CUDA::cufft_static``
- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+
- ``CUDA::cufftw_static``
.. _`cuda_toolkit_cuRAND`:

cuRAND
""""""
The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
Targets Created:
- ``CUDA::curand``
- ``CUDA::curand_static``
.. _`cuda_toolkit_cuSOLVER`:
cuSOLVER
""""""""
The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library.
Targets Created:
- ``CUDA::cusolver``
- ``CUDA::cusolver_static``
.. _`cuda_toolkit_cuSPARSE`:
cuSPARSE
""""""""
The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library.
Targets Created:
- ``CUDA::cusparse``
- ``CUDA::cusparse_static``
.. _`cuda_toolkit_cupti`:
cupti
"""""
The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_.
Targets Created:
- ``CUDA::cupti``
- ``CUDA::cupti_static``
.. _`cuda_toolkit_NPP`:
NPP
"""
The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries.
Targets Created:
- `nppc`:
- ``CUDA::nppc``
- ``CUDA::nppc_static``
- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h`
- ``CUDA::nppial``
- ``CUDA::nppial_static``
- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h`
- ``CUDA::nppicc``
- ``CUDA::nppicc_static``
- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h`
Removed starting in CUDA 11.0, use :ref:`nvJPEG<cuda_toolkit_nvJPEG>` instead.
- ``CUDA::nppicom``
- ``CUDA::nppicom_static``
- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h`
- ``CUDA::nppidei``
- ``CUDA::nppidei_static``
- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h`
- ``CUDA::nppif``
- ``CUDA::nppif_static``
- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h`
- ``CUDA::nppig``
- ``CUDA::nppig_static``
- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h`
- ``CUDA::nppim``
- ``CUDA::nppim_static``
- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h`
- ``CUDA::nppist``
- ``CUDA::nppist_static``
- `nppisu`: Memory support functions in `nppi_support_functions.h`
- ``CUDA::nppisu``
- ``CUDA::nppisu_static``
- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h`
- ``CUDA::nppitc``
- ``CUDA::nppitc_static``
- `npps`:
- ``CUDA::npps``
- ``CUDA::npps_static``
.. _`cuda_toolkit_nvBLAS`:
nvBLAS
""""""
The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries.
This is a shared library only.
Targets Created:
- ``CUDA::nvblas``
.. _`cuda_toolkit_nvGRAPH`:
nvGRAPH
"""""""
The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library.
Removed starting in CUDA 11.0
Targets Created:
- ``CUDA::nvgraph``
- ``CUDA::nvgraph_static``
.. _`cuda_toolkit_nvJPEG`:
nvJPEG
""""""
The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library.
Introduced in CUDA 10.
Targets Created:
- ``CUDA::nvjpeg``
- ``CUDA::nvjpeg_static``
.. _`cuda_toolkit_nvPTX`:
nvPTX Compiler
""""""""""""""
.. versionadded:: 3.25
The `nvPTX <https://docs.nvidia.com/cuda/ptx-compiler-api/index.html>`_ (PTX Compilation) library.
The PTX Compiler APIs are a set of APIs which can be used to compile a PTX program into GPU assembly code.
Introduced in CUDA 11.1
This is a static library only.
Targets Created:
- ``CUDA::nvptxcompiler_static`` starting in CUDA 11.1
.. _`cuda_toolkit_nvRTC`:
nvRTC
"""""
The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library.
This is a shared library only.
Targets Created:
- ``CUDA::nvrtc``
.. _`cuda_toolkit_nvml`:
nvidia-ML
"""""""""
The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_.
This is a shared library only.
Targets Created:
- ``CUDA::nvml``
.. _`cuda_toolkit_nvToolsExt`:
nvToolsExt
""""""""""
.. deprecated:: 3.25 With CUDA 10.0+, use :ref:`nvtx3 <cuda_toolkit_nvtx3>`.
The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_.
This is a shared library only.
Targets Created:
- ``CUDA::nvToolsExt``
.. _`cuda_toolkit_nvtx3`:
nvtx3
"""""
.. versionadded:: 3.25
The header-only `NVIDIA Tools Extension Library <https://nvidia.github.io/NVTX/doxygen/index.html>`_.
Introduced in CUDA 10.0.
Targets created:
- ``CUDA::nvtx3``
.. _`cuda_toolkit_opencl`:
OpenCL
""""""
The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_.
This is a shared library only.
Targets Created:
- ``CUDA::OpenCL``
.. _`cuda_toolkit_cuLIBOS`:
cuLIBOS
"""""""
The cuLIBOS library is a backend thread abstraction layer library which is
static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
libraries all automatically have this dependency linked.
Target Created:
- ``CUDA::culibos``
**Note**: direct usage of this target by consumers should not be necessary.
Result variables
^^^^^^^^^^^^^^^^
``CUDAToolkit_FOUND``
A boolean specifying whether or not the CUDA Toolkit was found.
``CUDAToolkit_VERSION``
The exact version of the CUDA Toolkit found (as reported by
``nvcc --version`` or ``version.txt``).
``CUDAToolkit_VERSION_MAJOR``
The major version of the CUDA Toolkit.
``CUDAToolkit_VERSION_MINOR``
The minor version of the CUDA Toolkit.
``CUDAToolkit_VERSION_PATCH``
The patch version of the CUDA Toolkit.
``CUDAToolkit_BIN_DIR``
The path to the CUDA Toolkit library directory that contains the CUDA
executable ``nvcc``.
``CUDAToolkit_INCLUDE_DIRS``
The path to the CUDA Toolkit ``include`` folder containing the header files
required to compile a project linking against CUDA.
``CUDAToolkit_LIBRARY_DIR``
The path to the CUDA Toolkit library directory that contains the CUDA
Runtime library ``cudart``.
``CUDAToolkit_LIBRARY_ROOT``
.. versionadded:: 3.18
The path to the CUDA Toolkit directory containing the nvvm directory and
version.txt.
``CUDAToolkit_TARGET_DIR``
The path to the CUDA Toolkit directory including the target architecture
when cross-compiling. When not cross-compiling this will be equivalent to
the parent directory of ``CUDAToolkit_BIN_DIR``.
``CUDAToolkit_NVCC_EXECUTABLE``
The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may
**not** be the same as
:variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`. ``nvcc`` must be
found to determine the CUDA Toolkit version as well as determining other
features of the Toolkit. This variable is set for the convenience of
modules that depend on this one.
#]=======================================================================]
# NOTE: much of this was simply extracted from FindCUDA.cmake.
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
#
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
#
# Copyright (c) 2007-2009
# Scientific Computing and Imaging Institute, University of Utah
#
# This code is licensed under the MIT License. See the FindCUDA.cmake script
# for the text of the license.
# The MIT License
#
# License for the specific language governing rights and limitations under
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as
# CMAKE_CUDA_COMPILER_TOOLKIT_ROOT and CMAKE_CUDA_COMPILER_LIBRARY_ROOT.
# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly
# different installation.
if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT)
set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}")
set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}")
set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin")
set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}")
set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}")
if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
endif()
else()
function(_CUDAToolkit_find_root_dir )
cmake_parse_arguments(arg "" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN})
if(NOT CUDAToolkit_BIN_DIR)
if(NOT CUDAToolkit_SENTINEL_FILE)
find_program(CUDAToolkit_NVCC_EXECUTABLE
NAMES nvcc nvcc.exe
PATHS ${arg_SEARCH_PATHS}
${arg_FIND_FLAGS}
)
endif()
if(NOT CUDAToolkit_NVCC_EXECUTABLE)
find_file(CUDAToolkit_SENTINEL_FILE
NAMES version.txt
PATHS ${arg_SEARCH_PATHS}
NO_DEFAULT_PATH
)
endif()
if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}")
# If NVCC exists then invoke it to find the toolkit location.
# This allows us to support wrapper scripts (e.g. ccache or colornvcc), CUDA Toolkit,
# NVIDIA HPC SDK, and distro's splayed layouts
execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "-v" "__cmake_determine_cuda"
OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT)
if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)")
get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE)
else()
get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
endif()
unset(_CUDA_NVCC_OUT)
mark_as_advanced(CUDAToolkit_BIN_DIR)
set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE)
endif()
if(CUDAToolkit_SENTINEL_FILE)
get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE)
set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin")
set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE)
mark_as_advanced(CUDAToolkit_BIN_DIR)
endif()
endif()
if(CUDAToolkit_BIN_DIR)
get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE)
set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE)
endif()
endfunction()
function(_CUDAToolkit_find_version_file result_variable)
# We first check for a non-scattered installation to prefer it over a scattered installation.
if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/version.txt")
set(${result_variable} "${CUDAToolkit_ROOT}/version.txt" PARENT_SCOPE)
elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/version.txt")
set(${result_variable} "${CUDAToolkit_ROOT_DIR}/version.txt" PARENT_SCOPE)
elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt")
set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt" PARENT_SCOPE)
elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt")
set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt" PARENT_SCOPE)
endif()
endfunction()
# For NVCC we can easily deduce the SDK binary directory from the compiler path.
if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY)
set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "")
# Try language provided path first.
_CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH)
mark_as_advanced(CUDAToolkit_BIN_DIR)
endif()
# Try user provided path
if(NOT CUDAToolkit_ROOT_DIR AND CUDAToolkit_ROOT)
_CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH)
endif()
if(NOT CUDAToolkit_ROOT_DIR)
_CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin)
endif()
# If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error.
if(NOT CUDAToolkit_ROOT_DIR AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT}))
# Declare error messages now, print later depending on find_package args.
set(fail_base "Could not find nvcc executable in path specified by")
set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}")
if(CUDAToolkit_FIND_REQUIRED)
if(DEFINED CUDAToolkit_ROOT)
message(FATAL_ERROR ${cuda_root_fail})
elseif(DEFINED ENV{CUDAToolkit_ROOT})
message(FATAL_ERROR ${env_cuda_root_fail})
endif()
else()
if(NOT CUDAToolkit_FIND_QUIETLY)
if(DEFINED CUDAToolkit_ROOT)
message(STATUS ${cuda_root_fail})
elseif(DEFINED ENV{CUDAToolkit_ROOT})
message(STATUS ${env_cuda_root_fail})
endif()
endif()
set(CUDAToolkit_FOUND FALSE)
unset(fail_base)
unset(cuda_root_fail)
unset(env_cuda_root_fail)
return()
endif()
endif()
# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults.
#
# - Linux: /usr/local/cuda-X.Y
# - macOS: /Developer/NVIDIA/CUDA-X.Y
# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y
#
# We will also search the default symlink location /usr/local/cuda first since
# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked
# directory is the desired location.
if(NOT CUDAToolkit_ROOT_DIR)
if(UNIX)
if(NOT APPLE)
set(platform_base "/usr/local/cuda-")
else()
set(platform_base "/Developer/NVIDIA/CUDA-")
endif()
else()
set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v")
endif()
# Build out a descending list of possible cuda installations, e.g.
file(GLOB possible_paths "${platform_base}*")
# Iterate the glob results and create a descending list.
set(versions)
foreach(p ${possible_paths})
# Extract version number from end of string
string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p})
if(IS_DIRECTORY ${p} AND p_version)
list(APPEND versions ${p_version})
endif()
endforeach()
# Sort numerically in descending order, so we try the newest versions first.
list(SORT versions COMPARE NATURAL ORDER DESCENDING)
# With a descending list of versions, populate possible paths to search.
set(search_paths)
foreach(v ${versions})
list(APPEND search_paths "${platform_base}${v}")
endforeach()
# Force the global default /usr/local/cuda to the front on Unix.
if(UNIX)
list(INSERT search_paths 0 "/usr/local/cuda")
endif()
# Now search for the toolkit again using the platform default search paths.
_CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin)
# We are done with these variables now, cleanup for caller.
unset(platform_base)
unset(possible_paths)
unset(versions)
unset(search_paths)
if(NOT CUDAToolkit_ROOT_DIR)
if(CUDAToolkit_FIND_REQUIRED)
message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.")
elseif(NOT CUDAToolkit_FIND_QUIETLY)
message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.")
endif()
set(CUDAToolkit_FOUND FALSE)
return()
endif()
endif()
_CUDAToolkit_find_version_file( _CUDAToolkit_version_file )
if(_CUDAToolkit_version_file)
# CUDAToolkit_LIBRARY_ROOT contains the device library and version file.
get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE)
endif()
unset(_CUDAToolkit_version_file)
if(CUDAToolkit_NVCC_EXECUTABLE AND
CMAKE_CUDA_COMPILER_VERSION AND
CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER)
# Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value
# This if statement will always match, but is used to provide variables for MATCH 1,2,3...
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}")
endif()
elseif(CUDAToolkit_NVCC_EXECUTABLE)
# Compute the version by invoking nvcc
execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
endif()
unset(NVCC_OUT)
else()
_CUDAToolkit_find_version_file(version_file)
if(version_file)
file(READ "${version_file}" VERSION_INFO)
if(VERSION_INFO MATCHES [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=])
set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
endif()
endif()
endif()
endif()
# Find target directory when crosscompiling.
if(CMAKE_CROSSCOMPILING)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
# Support for NVPACK
set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
if(ANDROID_ARCH_NAME STREQUAL "arm64")
set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi")
elseif (CMAKE_SYSTEM_NAME STREQUAL "QNX")
set(CUDAToolkit_TARGET_NAME "aarch64-qnx")
else()
set(CUDAToolkit_TARGET_NAME "aarch64-linux")
endif(ANDROID_ARCH_NAME STREQUAL "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(CUDAToolkit_TARGET_NAME "x86_64-linux")
endif()
if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
# add known CUDA target root path to the set of directories we search for programs, libraries and headers
list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}")
# Mark that we need to pop the root search path changes after we have
# found all cuda libraries so that searches for our cross-compilation
# libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or
# PATH.
set(_CUDAToolkit_Pop_ROOT_PATH True)
endif()
endif()
# If not already set we can simply use the toolkit root or it's a scattered installation.
if(NOT CUDAToolkit_TARGET_DIR)
# Not cross compiling
set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}")
# Now that we have the real ROOT_DIR, find components inside it.
list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR})
# Mark that we need to pop the prefix path changes after we have
# found the cudart library.
set(_CUDAToolkit_Pop_Prefix True)
endif()
# CUDAToolkit_TARGET_DIR always points to the directory containing the include directory.
# On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux.
if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h")
set(CUDAToolkit_INCLUDE_DIR "${CUDAToolkit_TARGET_DIR}/include")
elseif(NOT CUDAToolkit_FIND_QUIETLY)
message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIR.")
endif()
# The NVHPC layout moves math library headers and libraries to a sibling directory.
# Create a separate variable so this directory can be selectively added to math targets.
if(NOT EXISTS "${CUDAToolkit_INCLUDE_DIR}/cublas_v2.h")
file(REAL_PATH "${CUDAToolkit_TARGET_DIR}/../../" CUDAToolkit_MATH_INCLUDE_DIR)
cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "math_libs/include")
if(NOT EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/cublas_v2.h")
if(NOT CUDAToolkit_FIND_QUIETLY)
message(STATUS "Unable to find cublas_v2.h in either \"${CUDAToolkit_INCLUDE_DIR}\" or \"${CUDAToolkit_MATH_INCLUDE_DIR}\"")
endif()
unset(CUDAToolkit_MATH_INCLUDE_DIR)
endif()
endif()
# Find the CUDA Runtime Library libcudart
find_library(CUDA_CUDART
NAMES cudart
PATH_SUFFIXES lib64 lib/x64
)
find_library(CUDA_CUDART
NAMES cudart
PATH_SUFFIXES lib64/stubs lib/x64/stubs
)
if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY)
message(STATUS "Unable to find cudart library.")
endif()
if(_CUDAToolkit_Pop_Prefix)
list(REMOVE_AT CMAKE_PREFIX_PATH -1)
unset(_CUDAToolkit_Pop_Prefix)
endif()
#-----------------------------------------------------------------------------
# Perform version comparison and validate all required variables are set.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CUDAToolkit
REQUIRED_VARS
CUDAToolkit_INCLUDE_DIR
CUDA_CUDART
CUDAToolkit_BIN_DIR
VERSION_VAR
CUDAToolkit_VERSION
)
unset(CUDAToolkit_ROOT_DIR)
mark_as_advanced(CUDA_CUDART
CUDAToolkit_INCLUDE_DIR
CUDAToolkit_NVCC_EXECUTABLE
CUDAToolkit_SENTINEL_FILE
)
#-----------------------------------------------------------------------------
# Construct result variables
if(CUDAToolkit_FOUND)
set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
endif()
#-----------------------------------------------------------------------------
# Construct import targets
if(CUDAToolkit_FOUND)
function(_CUDAToolkit_find_and_add_import_lib lib_name)
cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS" ${ARGN})
set(search_names ${lib_name} ${arg_ALT})
find_library(CUDA_${lib_name}_LIBRARY
NAMES ${search_names}
HINTS ${CUDAToolkit_LIBRARY_DIR}
ENV CUDA_PATH
PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
${arg_EXTRA_PATH_SUFFIXES}
)
# Don't try any stub directories until we have exhausted all other
# search locations.
find_library(CUDA_${lib_name}_LIBRARY
NAMES ${search_names}
HINTS ${CUDAToolkit_LIBRARY_DIR}
ENV CUDA_PATH
PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs
# Support NVHPC splayed math library layout
../../math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64
../../math_libs/lib64
)
mark_as_advanced(CUDA_${lib_name}_LIBRARY)
if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
add_library(CUDA::${lib_name} UNKNOWN IMPORTED)
target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR)
string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs)
if(NOT ${math_libs} EQUAL -1)
target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_MATH_INCLUDE_DIR}")
endif()
endif()
set_property(TARGET CUDA::${lib_name} PROPERTY IMPORTED_LOCATION "${CUDA_${lib_name}_LIBRARY}")
foreach(dep ${arg_DEPS})
if(TARGET CUDA::${dep})
target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
endif()
endforeach()
if(arg_EXTRA_INCLUDE_DIRS)
target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${arg_EXTRA_INCLUDE_DIRS}")
endif()
endif()
endfunction()
if(NOT TARGET CUDA::toolkit)
add_library(CUDA::toolkit IMPORTED INTERFACE)
target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
endif()
_CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
_CUDAToolkit_find_and_add_import_lib(cudart)
_CUDAToolkit_find_and_add_import_lib(cudart_static)
# setup dependencies that are required for cudart_static when building
# on linux. These are generally only required when using the CUDA toolkit
# when CUDA language is disabled
if(NOT TARGET CUDA::cudart_static_deps
AND TARGET CUDA::cudart_static)
add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
find_package(Threads REQUIRED)
target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
endif()
if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX"))
# On Linux, you must link against librt when using the static cuda runtime.
find_library(CUDAToolkit_rt_LIBRARY rt)
mark_as_advanced(CUDAToolkit_rt_LIBRARY)
if(NOT CUDAToolkit_rt_LIBRARY)
message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
else()
target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
endif()
endif()
endif()
_CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
foreach (cuda_lib cublasLt cufft curand cusparse nppc nvjpeg)
_CUDAToolkit_find_and_add_import_lib(${cuda_lib})
_CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
endforeach()
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0)
# cublas depends on cublasLt
# https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library
_CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt culibos)
_CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static culibos)
else()
_CUDAToolkit_find_and_add_import_lib(cublas DEPS culibos)
_CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos)
endif()
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.4)
_CUDAToolkit_find_and_add_import_lib(cuFile DEPS culibos)
_CUDAToolkit_find_and_add_import_lib(cuFile_static DEPS culibos)
_CUDAToolkit_find_and_add_import_lib(cuFile_rdma DEPS cuFile culibos)
_CUDAToolkit_find_and_add_import_lib(cuFile_rdma_static DEPS cuFile_static culibos)
endif()
# cuFFTW depends on cuFFT
_CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
_CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2)
_CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos)
endif()
# cuSOLVER depends on cuBLAS, and cuSPARSE
set(cusolver_deps cublas cusparse)
set(cusolver_static_deps cublas_static cusparse_static culibos)
if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1)
# cusolver depends on libcusolver_metis and cublasLt
# https://docs.nvidia.com/cuda/archive/11.2.2/cusolver/index.html#link-dependency
list(APPEND cusolver_deps cublasLt)
_CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib
list(APPEND cusolver_static_deps cusolver_metis_static cublasLt_static)
endif()
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2)
# cusolver depends on liblapack_static.a starting with CUDA 10.1 update 2,
# https://docs.nvidia.com/cuda/archive/11.5.0/cusolver/index.html#static-link-lapack
_CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib
list(APPEND cusolver_static_deps cusolver_lapack_static)
endif()
_CUDAToolkit_find_and_add_import_lib(cusolver DEPS ${cusolver_deps})
_CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS ${cusolver_static_deps})
unset(cusolver_deps)
unset(cusolver_static_deps)
# nvGRAPH depends on cuRAND, and cuSOLVER.
_CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
_CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
# Process the majority of the NPP libraries.
foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
_CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc)
_CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static)
endforeach()
find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS
"${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include"
"${CUDAToolkit_INCLUDE_DIR}/../extras/CUPTI/include"
"${CUDAToolkit_INCLUDE_DIR}"
NO_DEFAULT_PATH)
if(CUDAToolkit_CUPTI_INCLUDE_DIR)
_CUDAToolkit_find_and_add_import_lib(cupti
EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
../extras/CUPTI/lib/
EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}")
_CUDAToolkit_find_and_add_import_lib(cupti_static
EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
../extras/CUPTI/lib/
EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}")
endif()
_CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1.0)
if(NOT TARGET CUDA::nvptxcompiler_static)
_CUDAToolkit_find_and_add_import_lib(nvptxcompiler_static DEPS cuda_driver)
target_link_libraries(CUDA::nvptxcompiler_static INTERFACE Threads::Threads)
endif()
endif()
_CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml)
if(WIN32)
# nvtools can be installed outside the CUDA toolkit directory
# so prefer the NVTOOLSEXT_PATH windows only environment variable
# In addition on windows the most common name is nvToolsExt64_1
find_library(CUDA_nvToolsExt_LIBRARY
NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt
PATHS ENV NVTOOLSEXT_PATH
ENV CUDA_PATH
PATH_SUFFIXES lib/x64 lib
)
endif()
_CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0)
# nvToolsExt is deprecated since nvtx3 introduction.
# Warn only if the project requires a sufficiently new CMake to make migration possible.
if(CMAKE_MINIMUM_REQUIRED_VERSION VERSION_GREATER_EQUAL 3.25)
set_property(TARGET CUDA::nvToolsExt PROPERTY DEPRECATION "nvToolsExt has been superseded by nvtx3 since CUDA 10.0 and CMake 3.25. Use CUDA::nvtx3 and include <nvtx3/nvToolsExt.h> instead.")
endif()
# Header-only variant. Uses dlopen().
if(NOT TARGET CUDA::nvtx3)
add_library(CUDA::nvtx3 INTERFACE IMPORTED)
target_include_directories(CUDA::nvtx3 SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
target_link_libraries(CUDA::nvtx3 INTERFACE ${CMAKE_DL_LIBS})
endif()
endif()
_CUDAToolkit_find_and_add_import_lib(OpenCL)
endif()
if(_CUDAToolkit_Pop_ROOT_PATH)
list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0)
unset(_CUDAToolkit_Pop_ROOT_PATH)
endif()
# Set CUDNN_FOUND, CUDNN_INCLUDE_DIRS, CUDNN_LIBRARY, CUDNN_VERSION_MAJOR, CUDNN_VERSION_MINOR, CUDNN_VERSION_PATCH and CUDNN_VERSION.
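# Example usage (a sketch; my_target is a placeholder consumer target):
#   find_package(CuDNN REQUIRED)
#   target_include_directories(my_target SYSTEM PRIVATE ${CUDNN_INCLUDE_DIRS})
#   target_link_libraries(my_target PRIVATE ${CUDNN_LIBRARY})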
include(FindPackageHandleStandardArgs)
find_path(CUDNN_INCLUDE_DIRS cudnn.h HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES include)
find_library(CUDNN_LIBRARY NAMES cudnn HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 lib/x64)
find_package_handle_standard_args(CuDNN DEFAULT_MSG CUDNN_INCLUDE_DIRS CUDNN_LIBRARY)
if (CUDNN_INCLUDE_DIRS AND CUDNN_LIBRARY)
file(READ ${CUDNN_INCLUDE_DIRS}/cudnn.h CUDNN_FILE_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
CUDNN_VERSION_MINOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
CUDNN_VERSION_PATCH "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
if(NOT CUDNN_VERSION_MAJOR)
file(READ ${CUDNN_INCLUDE_DIRS}/cudnn_version.h CUDNN_FILE_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
endif()
if(NOT CUDNN_VERSION_MAJOR)
set(CUDNN_VERSION "?")
else()
set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
endif()
message(STATUS "CuDNN library status:")
message(STATUS " version: ${CUDNN_VERSION}")
message(STATUS " include path: ${CUDNN_INCLUDE_DIRS}")
message(STATUS " libraries: ${CUDNN_LIBRARY}")
endif()
# ~~~
# Copyright 2021 Olivier Le Doeuff
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# This module defines the following variables:
#
# - TensorRT_FOUND: A boolean specifying whether or not TensorRT was found.
# - TensorRT_VERSION: The exact version of TensorRT found
# - TensorRT_VERSION_MAJOR: The major version of TensorRT.
# - TensorRT_VERSION_MINOR: The minor version of TensorRT.
# - TensorRT_VERSION_PATCH: The patch version of TensorRT.
# - TensorRT_VERSION_TWEAK: The tweak version of TensorRT.
# - TensorRT_INCLUDE_DIRS: The path to TensorRT ``include`` folder containing the header files required to compile a project linking against TensorRT.
# - TensorRT_LIBRARY_DIRS: The path to TensorRT library directory that contains libraries.
#
# This module creates the following targets:
# - trt::nvinfer
# - trt::nvinfer_plugin
# - trt::nvonnxparser
# - trt::nvparsers
# This script was inspired by https://github.com/NicolasIRAGNE/CMakeScripts
# and by https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake
#
# Hints
# ^^^^^
# A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look.
# ~~~
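# Example usage (a sketch; my_app is a placeholder, and TensorRT_ROOT may need to be
# set when TensorRT is not installed in a default system location):
#   find_package(TensorRT REQUIRED COMPONENTS nvinfer nvonnxparser)
#   target_include_directories(my_app SYSTEM PRIVATE ${TensorRT_INCLUDE_DIRS})
#   target_link_libraries(my_app PRIVATE trt::nvinfer trt::nvonnxparser)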
if(NOT TensorRT_FIND_COMPONENTS)
set(TensorRT_FIND_COMPONENTS nvinfer nvinfer_plugin nvonnxparser nvparsers)
endif()
set(TensorRT_LIBRARIES)
# find the include directory of TensorRT
find_path(
TensorRT_INCLUDE_DIR
NAMES NvInfer.h
PATHS ${TensorRT_ROOT} ENV TensorRT_ROOT
PATH_SUFFIXES include
)
string(FIND ${TensorRT_INCLUDE_DIR} "NOTFOUND" _include_dir_notfound)
if(NOT _include_dir_notfound EQUAL -1)
if(TensorRT_FIND_REQUIRED)
message(FATAL_ERROR "Failed to find TensorRT, please set TensorRT_ROOT. Include path not found.")
endif()
return()
endif()
set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})
# Extract version of tensorrt
if(EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_TWEAK REGEX "^#define NV_TENSORRT_BUILD [0-9]+.*$")
string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
string(REGEX REPLACE "^#define NV_TENSORRT_BUILD ([0-9]+).*$" "\\1" TensorRT_VERSION_TWEAK "${TensorRT_TWEAK}")
set(TensorRT_VERSION "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}.${TensorRT_VERSION_TWEAK}")
endif()
function(_find_trt_component component)
# Find the library for the component (i.e. nvinfer, nvparsers, etc.)
find_library(
TensorRT_${component}_LIBRARY
NAMES ${component}
PATHS ${TensorRT_ROOT} ${TENSORRT_LIBRARY_DIR} ENV TensorRT_ROOT
)
string(FIND ${TensorRT_${component}_LIBRARY} "NOTFOUND" _library_not_found)
if(NOT TensorRT_LIBRARY_DIR)
get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY)
set(TensorRT_LIBRARY_DIR
"${_path}"
CACHE INTERNAL "TensorRT_LIBRARY_DIR"
)
endif()
if(NOT TensorRT_LIBRARY_DIRS)
get_filename_component(_path ${TensorRT_${component}_LIBRARY} DIRECTORY)
set(TensorRT_LIBRARY_DIRS
"${_path}"
CACHE INTERNAL "TensorRT_LIBRARY_DIRS"
)
endif()
# Library found, and the target doesn't already exist
if(_library_not_found EQUAL -1 AND NOT TARGET trt::${component})
set(TensorRT_${component}_FOUND
TRUE
CACHE INTERNAL "Found ${component}"
)
# Create a target
add_library(trt::${component} IMPORTED INTERFACE)
target_include_directories(trt::${component} SYSTEM INTERFACE "${TensorRT_INCLUDE_DIRS}")
target_link_libraries(trt::${component} INTERFACE "${TensorRT_${component}_LIBRARY}")
set(TensorRT_LIBRARIES ${TensorRT_LIBRARIES} ${TensorRT_${component}_LIBRARY})
endif()
endfunction()
# Find each component
foreach(component IN LISTS TensorRT_FIND_COMPONENTS)
_find_trt_component(${component})
endforeach()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(TensorRT HANDLE_COMPONENTS VERSION_VAR TensorRT_VERSION REQUIRED_VARS TensorRT_INCLUDE_DIR)
#ifndef BATCH_STREAM_H
#define BATCH_STREAM_H
#include "NvInfer.h"
#include <algorithm>
#include <assert.h>
#include <fstream>   // std::ifstream used in update()
#include <iostream>  // std::cout / std::cerr used in update()
#include <stdio.h>
#include <string>    // std::string members and std::to_string
#include <vector>
class BatchStream
{
public:
BatchStream(unsigned int batchSize, std::vector<unsigned int> dims, unsigned int maxBatches, std::string prefix)
: _batchSize(batchSize)
, _maxBatches(maxBatches)
, _prefix(prefix)
{
_dims.nbDims = dims.size()+1; //The number of dimensions. Max 8.
assert(_dims.nbDims <= 8 && "The maximum number of dimensions supported for a tensor is 8");
_dims.d[0] = batchSize; //Batch Size
for(std::size_t i = 0; i < _dims.nbDims-1; ++i) _dims.d[i+1] = dims[i];
for(auto elem : dims) _imageSize *= elem;
_batch.resize(_batchSize * _imageSize, 0);
_fileBatch.resize(_dims.d[0] * _imageSize, 0);
reset(0);
}
// Resets data members
void reset(int firstBatch)
{
_batchCount = 0;
_fileCount = 0;
_fileBatchPos = _dims.d[0];
skip(firstBatch);
}
// Advance to next batch and return true, or return false if there is no batch left.
bool next()
{
if (_batchCount == _maxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < _batchSize; batchPos += csize, _fileBatchPos += csize)
{
assert(_fileBatchPos > 0 && _fileBatchPos <= _dims.d[0]);
if (_fileBatchPos == _dims.d[0] && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(_batchSize - batchPos, static_cast<int32_t>(_dims.d[0] - _fileBatchPos));
std::copy_n(getFileBatch() + _fileBatchPos * _imageSize, csize * _imageSize, getBatch() + batchPos * _imageSize);
}
_batchCount++;
return true;
}
// Skips the batches
void skip(int skipCount)
{
if (_batchSize >= _dims.d[0] && _batchSize % _dims.d[0] == 0 && _fileBatchPos == _dims.d[0])
{
_fileCount += skipCount * _batchSize / _dims.d[0];
return;
}
int x = _batchCount;
for (std::size_t i = 0; i < skipCount; ++i) next();
_batchCount = x;
}
float* getBatch() { return &_batch[0]; }
int getBatchesRead() const { return _batchCount; }
int getBatchSize() const { return _batchSize; }
int getImageSize() const { return _imageSize; }
nvinfer1::Dims getDims() const { return _dims; }
private:
float* getFileBatch() { return &_fileBatch[0]; }
bool update()
{
std::string inputFileName = _prefix + std::to_string(_fileCount++) + ".batch";
std::ifstream file(inputFileName, std::ios_base::in);
if (!file.is_open()) std::cout << "Could not open calibration file " << inputFileName << std::endl;
for(std::size_t i = 0; i < _imageSize; ++i)
{
if(file.eof())
{
std::cerr << "Error: Unexpected end of file. Wrong input size." << std::endl;
std::exit(EXIT_FAILURE);
}
file >> _fileBatch[i];
}
_fileBatchPos = 0;
file.close();
return true;
}
int _batchSize{0};
int _maxBatches{0};
int _batchCount{0};
int _fileCount{0};
int _fileBatchPos{0};
int _imageSize{1};
nvinfer1::Dims _dims;
std::vector<float> _batch;
std::vector<float> _fileBatch;
std::string _prefix;
};
#endif
#ifndef __AIDGE_TENSORRT_GRAPH_HPP__
#define __AIDGE_TENSORRT_GRAPH_HPP__
#include "Utils.hpp"
#include "cuda_utils.h"
#include <string>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
// Allow TensorRT to use up to 1GB of GPU memory for tactic selection
constexpr size_t MAX_WORKSPACE_SIZE = 1ULL << 30; // 1 GB
typedef enum
{
SYNC,
ASYNC
} ExecutionMode_T;
typedef struct
{
std::string name;
int nbElements;
int size;
} IODesc;
typedef struct
{
std::vector<IODesc> inputs;
std::vector<IODesc> outputs;
unsigned int nIO;
} IOGraphDesc;
/**
* @class Graph
* @brief Manages the lifecycle and execution of a neural network graph using TensorRT.
*
* The Graph class encapsulates the functionality required to manage, configure, and execute
* a neural network graph for inference using NVIDIA's TensorRT. This includes loading models
* from ONNX or TensorRT files, setting the CUDA device and data types, managing calibration
* for INT8 precision, and running inference in both synchronous and asynchronous modes.
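*
* A minimal usage sketch (illustrative only: the model path is a placeholder, and
* `inputs`/`outputs` must point to buffers matching the layout reported by
* getIODescription()):
* @code
* Graph graph("model.onnx", 0, -32);                  // load an ONNX model on GPU 0 in float32
* graph.initialize();                                 // build the engine and execution context
* graph.run(inputs, outputs, ExecutionMode_T::SYNC);  // one synchronous inference
* @endcode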
*/
class Graph
{
public:
/**
* @brief Constructor for the Graph class.
*
* @param filePath Path to the file to load (default is empty).
* @param device_id Device ID to use (default is 0).
* @param nbbits Number of bits for data (default is -32).
*/
Graph(std::string const &filePath,
unsigned int device_id,
int nbbits);
/**
* @brief Destructor for the Graph class.
*/
~Graph();
/**
* @brief Set the CUDA device.
*
* @param id Device ID.
*/
void device(unsigned int id);
/**
* @brief Set the data type for the graph.
*
* @param nbbits Number of bits for data.
*/
void databits(int nbbits);
/**
* @brief Set the data mode for the graph.
*
* @param datatype Data type for the graph.
*/
void datamode(nvinfer1::DataType datatype);
/**
* @brief Load a file into the graph.
*
* @param filePath Path to the file to load.
*/
void load(std::string const &filePath);
/**
* @brief Load an ONNX file into the graph.
*
* @param onnxModelPath Path to the ONNX model file.
*/
void load_onnx(std::string const &onnxModelPath);
/**
* @brief Load a TensorRT file into the graph.
*
* @param trtModelPath Path to the TensorRT model file.
*/
void load_trt(std::string const &trtModelPath);
/**
* @brief Save the graph to a file.
*
* @param fileName Name of the file to save.
*/
void save(std::string const &fileName);
/**
* @brief Initializes the TensorRT engine and execution context for the Graph class. This involves building a serialized network, deserializing it into a CUDA engine, and setting up the necessary execution context and I/O descriptors.
*/
void initialize();
/**
* @brief Calibrate the graph using the calibration data found inside the `calibration` folder.
* This folder should include a `.info` file containing the dimensions of the calibration data, along with the data stored in `.batch` files.
* Calibration can be expensive, so it is beneficial to generate the calibration data once and then reuse it for subsequent builds of the network. The cache includes the regression cutoff and quantile values used to generate it, and will not be used if these do not match the settings of the current calibrator. However, the network should be recalibrated if its structure changes or if the input data set changes, and it is the responsibility of the application to ensure this.
*
* @param calibration_folder_path Path to the calibration folder.
* @param cache_file_path Path to the cache file.
* @param batch_size Batch size for calibration (default is 1).
*/
void calibrate(std::string const &calibration_folder_path, std::string const &cache_file_path, unsigned int batch_size);
/**
* @brief Profile the graph's execution by printing the average profiled TensorRT process time per stimulus.
*
* @param nb_iterations Number of iterations for profiling.
* @param mode Execution mode (SYNC or ASYNC).
*/
void profile(unsigned int nb_iterations, ExecutionMode_T mode = ExecutionMode_T::ASYNC);
/**
* @brief Automatically set the input profile for the graph.
*
* @param dims_inputs Dimensions of the input tensors.
*/
void auto_input_profile(std::vector<std::vector<int>> dims_inputs);
// Inference methods
/**
* @brief Run the graph.
*
* @param inputs Input data.
* @param outputs Output data.
* @param mode Execution mode (SYNC or ASYNC).
*/
void run(void **inputs, void **outputs, ExecutionMode_T mode = ExecutionMode_T::ASYNC);
/**
* @brief Run the graph asynchronously.
*
* @param inputs Input data.
* @param outputs Output data.
*/
void run_async(void **inputs, void **outputs);
/**
* @brief Run the graph synchronously.
*
* @param inputs Input data.
* @param outputs Output data.
*/
void run_sync(void **inputs, void **outputs);
// Getters
/**
* @brief Get the number of IO tensors in the graph.
*
* @return unsigned int Number of IO tensors.
*/
unsigned int getNbIO();
/**
* @brief Get the IO descriptors of the graph.
*
* @return IOGraphDesc IO descriptors.
*/
IOGraphDesc getIODescription();
protected:
/**
* @brief Initialize IO descriptors for the graph.
*/
void initialize_io_descriptors();
private:
// TensorRT objects for network, engine
// and context creation and management
nvinfer1::INetworkDefinition *_network{nullptr};
nvinfer1::ICudaEngine *_engine{nullptr};
nvinfer1::IBuilder *_builder{nullptr};
nvinfer1::IBuilderConfig *_builderconfig{nullptr};
nvinfer1::IExecutionContext *_context{nullptr};
nvinfer1::IOptimizationProfile *_profile{nullptr};
nvinfer1::IInt8Calibrator *_calibrator{nullptr};
// Graph IO information
IOGraphDesc _iodescriptors;
// Buffer for GPU computation
std::vector<void *> _iobuffer;
// Stream
cudaStream_t _stream{nullptr};
};
#endif // __AIDGE_TENSORRT_GRAPH_HPP__
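// --------------------------------------------------------------------------
// Illustrative sketch (not part of the export sources): a minimal standalone
// example of the Graph API declared above, running one synchronous inference
// with host buffers sized from getIODescription(). The file name
// "model.onnx" and the float input/output types are assumptions for
// illustration; see main.cpp and the Python binding for the shipped examples.
// --------------------------------------------------------------------------
#include "Graph.hpp"
#include <iostream>
#include <vector>
int main()
{
    Graph graph("model.onnx", /*device_id=*/0, /*nbbits=*/-32);
    graph.initialize();
    IOGraphDesc iodesc = graph.getIODescription();
    // Allocate host buffers: dummy zero-filled inputs, raw byte outputs.
    std::vector<void *> inputs;
    for (const IODesc& in : iodesc.inputs)
        inputs.push_back(new float[in.nbElements]());   // assumes float inputs
    std::vector<void *> outputs;
    for (const IODesc& out : iodesc.outputs)
        outputs.push_back(new char[out.size]);
    graph.run_sync(inputs.data(), outputs.data());
    // Print the first output tensor, assuming float outputs.
    const float* result = static_cast<const float*>(outputs[0]);
    for (int i = 0; i < iodesc.outputs[0].nbElements; ++i)
        std::cout << result[i] << std::endl;
    for (void* p : inputs)  delete[] static_cast<float*>(p);
    for (void* p : outputs) delete[] static_cast<char*>(p);
    return 0;
}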
#include <iterator>
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string cacheName, bool readCache = true)
: _stream(stream),
_calibrationCacheName(cacheName),
_readCache(readCache)
{
nvinfer1::Dims dims = _stream.getDims();
_inputCount = _stream.getBatchSize() * dims.d[1] * dims.d[2] * dims.d[3];
CHECK_CUDA_STATUS(cudaMalloc(&_deviceInput, _inputCount * sizeof(float)));
_stream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_CUDA_STATUS(cudaFree(_deviceInput));
}
int getBatchSize() const noexcept override { return _stream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
{
if (!_stream.next())
{
return false;
}
CHECK_CUDA_STATUS(cudaMemcpy(_deviceInput, _stream.getBatch(), _inputCount * sizeof(float), cudaMemcpyHostToDevice));
bindings[0] = _deviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) noexcept override
{
_calibrationCache.clear();
std::ifstream input(calibrationTableName(), std::ios::binary);
input >> std::noskipws;
if (_readCache && input.good())
{
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(_calibrationCache));
}
length = _calibrationCache.size();
return length ? &_calibrationCache[0] : nullptr;
}
virtual void writeCalibrationCache(const void* cache, size_t length) noexcept override
{
std::ofstream output(calibrationTableName(), std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
std::string calibrationTableName()
{
return _calibrationCacheName;
}
BatchStream _stream;
size_t _inputCount;
bool _readCache{true};
std::string _calibrationCacheName;
void* _deviceInput{nullptr};
std::vector<char> _calibrationCache;
};
#ifndef __AIDGE_TENSORRT_UTILS_HPP__
#define __AIDGE_TENSORRT_UTILS_HPP__
#include <iostream>
#include <sstream>
#include <iomanip>
#include <string>
#include <vector>
#include <algorithm>
#include <NvInfer.h>
#include <cuda_runtime.h>
#define DIV_UP(X, Y) ((X) / (Y) + ((X) % (Y) > 0))
#define CEIL_DIV(X, Y) (((X) + (Y)-1) / (Y))
static struct Profiler : public nvinfer1::IProfiler
{
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) noexcept
{
auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
} gProfiler;
static class Logger : public nvinfer1::ILogger
{
void log(Severity severity, const char* msg) noexcept override
{
switch (severity)
{
case Severity::kINTERNAL_ERROR:
std::cerr << "INTERNAL_ERROR: ";
break;
case Severity::kERROR:
std::cerr << "ERROR: ";
break;
case Severity::kWARNING:
std::cerr << "WARNING: ";
break;
case Severity::kINFO:
std::cerr << "INFO: ";
break;
default:
std::cerr << "VERBOSE: ";
break;
}
std::cerr << msg << std::endl;
}
} gLogger;
static bool endsWith(std::string const &str, std::string const &suffix)
{
if (str.length() < suffix.length()) {
return false;
}
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
static std::string removeSubstring(const std::string& input, const std::string& substringToRemove) {
std::string result = input;
size_t pos = result.find(substringToRemove);
if (pos != std::string::npos) {
result.erase(pos, substringToRemove.length());
}
return result;
}
static std::string baseName(const std::string& filePath)
{
const size_t slashPos = filePath.find_last_of("/\\");
return (slashPos == std::string::npos) ? filePath
: filePath.substr(slashPos + 1);
}
static size_t dataTypeToSize(nvinfer1::DataType dataType)
{
switch ((int)dataType) {
case int(nvinfer1::DataType::kFLOAT):
return 4;
case int(nvinfer1::DataType::kHALF):
return 2;
case int(nvinfer1::DataType::kINT8):
return 1;
case int(nvinfer1::DataType::kINT32):
return 4;
case int(nvinfer1::DataType::kBOOL):
return 1;
default:
return 4;
}
}
static bool cudaSupportsDataType(nvinfer1::DataType dataType)
{
int deviceId;
cudaError_t status = cudaGetDevice(&deviceId);
if (status != cudaSuccess) {
std::cerr << "Failed to get CUDA device: " << cudaGetErrorString(status) << std::endl;
return false;
}
cudaDeviceProp deviceProp;
status = cudaGetDeviceProperties(&deviceProp, deviceId);
if (status != cudaSuccess) {
std::cerr << "Failed to get device properties: " << cudaGetErrorString(status) << std::endl;
return false;
}
int major = deviceProp.major;
int minor = deviceProp.minor;
float computeCapability = major + minor * 0.1f;
switch (dataType) {
    case nvinfer1::DataType::kFLOAT:
    case nvinfer1::DataType::kHALF:
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kBOOL:
        // Conservative threshold: treat all supported data types as requiring SM 7.5 or newer.
        return computeCapability >= 7.5f;
    default:
        std::cerr << "Unknown data type in cudaSupportsDataType" << std::endl;
        return false;
}
}
static bool cudaHasFastFp16()
{
return cudaSupportsDataType(nvinfer1::DataType::kHALF);
}
static bool cudaHasFastInt8()
{
return cudaSupportsDataType(nvinfer1::DataType::kINT8);
}
#endif // __AIDGE_TENSORRT_UTILS_HPP__
\ No newline at end of file
#ifndef __AIDGE_TENSORRT_CUDA_UTILS_H__
#define __AIDGE_TENSORRT_CUDA_UTILS_H__
#include <cublas_v2.h>
#include <cuda.h>
#include <cudnn.h>
#include <cstdlib>
#include <iostream>
#include <sstream>
#define FatalError(s) \
{ \
std::stringstream _message;                                          \
_message << std::string(s) << "\n" << __FILE__ << ':' << __LINE__;   \
std::cerr << _message.str() << "\nAborting...\n"; \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
}
#define CHECK_CUDA_STATUS(status) \
{ \
std::stringstream _error; \
if (status != 0) { \
_error << "Cuda failure: " << cudaGetErrorString(status); \
FatalError(_error.str()); \
} \
}
#endif // __AIDGE_TENSORRT_CUDA_UTILS_H__
\ No newline at end of file
#include "Graph.hpp"
#include <iostream>
#include <vector>
// Uncomment if you want to run a single inference with static inputs
// #include "inputs.h"
int main()
{
Graph model;
model.load("model.onnx");
// Uncomment if the model does not have explicit batch
// Don't forget to change the values of the input dimensions
// std::vector<std::vector<int>> dims_input{{ 1, 1, 28, 28 }};
// model.auto_input_profile(dims_input);
// Uncomment if you want to activate FP16
// model.datamode(nvinfer1::DataType::kHALF);
model.initialize();
// Comment to remove model profiling
model.profile(10);
// Example of script to run a single inference with static inputs
/*
const unsigned int nb_classes = 10;
std::vector<void *> bufferIn(1, nullptr);
bufferIn[0] = (void *)new float[28*28*1];
float *pData = (float *)bufferIn[0];
for (unsigned int j = 0; j < 28*28*1; ++j) {
    pData[j] = inputs[j];
}
std::vector<void *> bufferOut(1, nullptr);
bufferOut[0] = (void *)new float[nb_classes];
model.run_async(bufferIn.data(), bufferOut.data());
float *floatArray = static_cast<float *>(bufferOut[0]);
for (unsigned int i = 0; i < nb_classes; ++i)
{
std::cout << i << ": " << floatArray[i] << std::endl;
}
delete[] (float *)bufferIn[0];
delete[] (float *)bufferOut[0];
*/
return 0;
}
\ No newline at end of file
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include "Graph.hpp"
#include <vector>
namespace py = pybind11;
void init_Graph(py::module& m)
{
py::enum_<ExecutionMode_T>(m, "exe_mode")
.value("sync", ExecutionMode_T::SYNC)
.value("async", ExecutionMode_T::ASYNC)
.export_values()
;
py::class_<Graph>(m, "Graph")
.def(py::init<std::string, unsigned int, int>(),
py::arg("filepath") = "",
py::arg("device_id") = 0,
py::arg("nb_bits") = -32,
R"mydelimiter(
Construct a new Graph object.
:param filepath: Path to the file to load (default is empty).
:type filepath: str
:param device_id: Device ID to use (default is 0).
:type device_id: unsigned int
:param nb_bits: Number of bits for data (default is -32).
:type nb_bits: int
)mydelimiter")
.def("device", &Graph::device, py::arg("id"),
R"mydelimiter(
Set the CUDA device.
:param id: Device ID.
:type id: unsigned int
)mydelimiter")
.def("load", &Graph::load, py::arg("filepath"),
R"mydelimiter(
Load a graph from a file, either a `.onnx` file or a `.trt` engine.
:param filepath: Path to the file.
:type filepath: str
)mydelimiter")
.def("save", &Graph::save, py::arg("filepath"),
R"mydelimiter(
Save the current graph as a `.trt` engine.
:param filepath: Path to the file.
:type filepath: str
)mydelimiter")
.def("calibrate", &Graph::calibrate, py::arg("calibration_folder_path") = "./calibration_folder/", py::arg("cache_file_path") = "./calibration_cache", py::arg("batch_size") = 1,
R"mydelimiter(
Calibrate the graph to determine the appropriate scaling factors for converting floating-point values to lower-precision representations, using the calibration data found inside the specified `calibration_folder`. This folder should include a `.info` file containing the dimensions of the calibration data, along with the data stored in `.batch` files.
Calibration can be expensive, so it is beneficial to generate the calibration data once and then reuse it for subsequent builds of the network. The cache includes the regression cutoff and quantile values used to generate it, and will not be used if these do not match the settings of the current calibrator. However, the network should be recalibrated if its structure changes or if the input data set changes, and it is the responsibility of the application to ensure this.
:param calibration_folder_path: Path to the calibration folder.
:type calibration_folder_path: str
:param cache_file_path: Path to the cache file.
:type cache_file_path: str
:param batch_size: Batch size for calibration (default is 1).
:type batch_size: int
)mydelimiter")
.def("initialize", &Graph::initialize,
R"mydelimiter(
Initializes the TensorRT engine and execution context for the Graph class. This involves building a serialized network, deserializing it into a CUDA engine, and setting up the necessary execution context and I/O descriptors.
)mydelimiter")
.def("profile", &Graph::profile, py::arg("nb_iterations"), py::arg("mode")= ExecutionMode_T::ASYNC,
R"mydelimiter(
Profile the graph's execution by printing the average profiled TensorRT process time per stimulus.
:param nb_iterations: Number of iterations for profiling.
:type nb_iterations: unsigned int
:param mode: Execution mode (SYNC or ASYNC, default is ASYNC).
:type mode: ExecutionMode_T
)mydelimiter")
.def("run_sync", [](Graph& graph, py::list inputs) -> py::list {
py::list outputs;
std::vector<void *> bufferIn;
std::vector<void *> bufferOut;
IOGraphDesc iodesc = graph.getIODescription();
// Fill bufferIn for inference
for (py::handle array: inputs)
{
// py::buffer_info buf_info =
// array.cast<py::array_t<float>>().request();
py::buffer_info buf_info = array.cast<py::array>().request();
bufferIn.push_back(static_cast<void*>(buf_info.ptr));
}
// Allocate memory resources for bufferOut
for (unsigned int i = 0; i < iodesc.outputs.size(); ++i)
{
void* out = (void *)new char[iodesc.outputs[i].size];
bufferOut.push_back(out);
}
// Run inference
graph.run_sync(bufferIn.data(), bufferOut.data());
// Get outputs
for (unsigned int i = 0; i < iodesc.outputs.size(); ++i)
{
    // TODO: make this independent of the output data type
    float* data_ptr = static_cast<float*>(bufferOut[i]);
    // py::array_t copies the data, so the temporary buffer can be freed afterwards
    py::array_t<float> processed_array = py::array_t<float>(
        iodesc.outputs[i].nbElements, data_ptr);
    outputs.append(processed_array);
    delete[] static_cast<char*>(bufferOut[i]);
}
return outputs;
}, py::arg("inputs"),
R"mydelimiter(
Run the graph.
:param inputs: Input data.
:type inputs: list
:param outputs: Output data.
:type outputs: list
:param mode: Execution mode (SYNC or ASYNC, default is ASYNC).
:type mode: ExecutionMode_T
)mydelimiter");
}
PYBIND11_MODULE(aidge_trt, m)
{
init_Graph(m);
}
#include "Graph.hpp"
#include <fstream>
#include <sstream>
#include "BatchStream.hpp"
#include "IInt8EntropyCalibrator.hpp"
#include <dirent.h>
Graph::Graph( std::string const& filePath = "",
unsigned int device_id = 0,
int nbbits = -32)
{
// ctor
this->_builder = nvinfer1::createInferBuilder(gLogger);
this->_profile = this->_builder->createOptimizationProfile();
this->_builderconfig = this->_builder->createBuilderConfig();
// this->_builderconfig->setMaxWorkspaceSize(MAX_WORKSPACE_SIZE);
this->_builderconfig->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, MAX_WORKSPACE_SIZE);
CHECK_CUDA_STATUS(cudaStreamCreate(&(this->_stream)));
device(device_id);
databits(nbbits);
if (!filePath.empty()) {
load(filePath);
}
}
Graph::~Graph()
{
// dtor
if (!this->_iobuffer.empty()) {
for (unsigned int i = 0; i < this->_iobuffer.size(); ++i) {
CHECK_CUDA_STATUS(cudaFree(this->_iobuffer[i]));
}
this->_iobuffer.clear();
}
CHECK_CUDA_STATUS(cudaStreamDestroy(this->_stream));
}
void Graph::device(unsigned int id)
{
CHECK_CUDA_STATUS(cudaSetDevice(id));
}
void Graph::databits(int nbbits)
{
nvinfer1::DataType datatype;
if (nbbits == -32) {
datatype = nvinfer1::DataType::kFLOAT;
}
else if (nbbits == -16) {
datatype = nvinfer1::DataType::kHALF;
}
else if (nbbits == -8) {
datatype = nvinfer1::DataType::kFP8;
}
else if (nbbits == 32) {
datatype = nvinfer1::DataType::kINT32;
}
else if (nbbits == 8) {
datatype = nvinfer1::DataType::kINT8;
}
else {
std::cout << "Cannot use this number of bits ( "
<< nbbits
<< ") for infering the network"
<< std::endl;
return;
}
datamode(datatype);
}
void Graph::datamode(nvinfer1::DataType datatype)
{
switch (datatype) {
case nvinfer1::DataType::kFLOAT:
// Do nothing as it is the default datatype
break;
case nvinfer1::DataType::kHALF: {
if (!cudaHasFastFp16()) {
std::cout << "Cannot use FP16 for this platform \nLet default datatype activated." << std::endl;
return;
}
this->_builderconfig->setFlag(nvinfer1::BuilderFlag::kFP16);
}
break;
case nvinfer1::DataType::kINT8: {
if (!cudaHasFastInt8()) {
std::cout << "Cannot use INT8 for this platform \nLet default datatype activated." << std::endl;
return;
}
// An INT8 calibrator still has to be provided via calibrate() before initialize()
this->_builderconfig->setFlag(nvinfer1::BuilderFlag::kINT8);
}
break;
case nvinfer1::DataType::kFP8:
case nvinfer1::DataType::kINT32:
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kUINT8:
default:
std::cout << "Cannot use this datatype for infering the network \nLet default datatype activated." << std::endl;
break;
}
}
void Graph::calibrate( std::string const& calibration_folder_path = "./calibration_folder/",
std::string const& cache_file_path = "./calibration_cache",
unsigned int batch_size = 1)
{
// Open calibration files
const std::string calibDir = calibration_folder_path;
std::vector<std::string> filesCalib;
struct dirent* pFile;
DIR* pDir = opendir(calibDir.c_str());
if (pDir == NULL) {
std::cout << "No directory for batches calibration" << std::endl;
}
else {
while ((pFile = readdir(pDir)) != NULL)
{
if (pFile->d_name[0] != '.') filesCalib.push_back(std::string(calibDir + pFile->d_name));
}
closedir(pDir);
}
unsigned int nbCalibFiles = filesCalib.size();
if(nbCalibFiles == 0) std::cout << "Cannot find calibration files in dir " << calibDir << std::endl;
// Get input tensor shape by reading data.info file in calibration folder
std::vector<unsigned int> dims;
std::ifstream inputFile(calibDir + "/.info");
if (!inputFile.is_open()) {
std::cout << "Error opening the file .info" << std::endl;
} else {
std::string line;
// Read all lines from the file
while (std::getline(inputFile, line)) {
try {
unsigned int intValue = std::stoul(line); // Use stoul for unsigned int
dims.push_back(intValue);
} catch (const std::invalid_argument& e) {
std::cerr << "Error converting string to unsigned int: " << e.what() << std::endl;
} catch (const std::out_of_range& e) {
std::cerr << "Error: Value out of range for unsigned int conversion." << std::endl;
}
}
}
inputFile.close();
BatchStream calibrationStream(batch_size, dims, nbCalibFiles/batch_size, calibration_folder_path);
this->_calibrator = new Int8EntropyCalibrator(calibrationStream, 0, cache_file_path);
this->_builderconfig->setInt8Calibrator(this->_calibrator);
}
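// --------------------------------------------------------------------------
// Illustrative sketch (not part of the export sources): the calibration
// folder layout expected by calibrate() above, followed by a hypothetical
// call sequence. The folder name, shape and file count are assumptions for
// illustration only.
//
//   calibration_folder/
//     .info        one dimension per line, e.g. "1", "3", "224", "224"
//     0.batch      whitespace-separated float values for one batch
//     1.batch      ...
//
// Hypothetical usage (INT8 build):
//     Graph graph("model.onnx", 0, 8);   // 8 -> INT8, sets BuilderFlag::kINT8
//     graph.calibrate("./calibration_folder/", "./calibration_cache", 1);
//     graph.initialize();                // builds the engine using the calibrator
// --------------------------------------------------------------------------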
void Graph::load(std::string const& filePath)
{
if (endsWith(filePath, ".onnx")) {
load_onnx(filePath);
}
else if (endsWith(filePath, ".trt")) {
load_trt(filePath);
}
else {
throw std::runtime_error("Cannot load this format of file");
}
}
void Graph::load_onnx(std::string const& onnxModelPath)
{
// Set the TensorRT network creation flags (explicit batch)
// TODO: adapt the flags if dynamic input support is needed
nvinfer1::NetworkDefinitionCreationFlags creationFlag;
creationFlag = 1 << static_cast<int32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
this->_network = this->_builder->createNetworkV2(creationFlag);
nvonnxparser::IParser* parser = nvonnxparser::createParser(*this->_network, gLogger);
if (!parser->parseFromFile(onnxModelPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO)))
    throw std::runtime_error("Failed to parse ONNX file " + onnxModelPath);
this->_network->setName(removeSubstring(baseName(onnxModelPath), ".onnx").c_str());
}
void Graph::load_trt(std::string const& trtModelPath)
{
std::ifstream cudaEngineStream(trtModelPath, std::ios::binary);
if(!cudaEngineStream.good())
throw std::runtime_error("Could not open cuda engine file " + trtModelPath);
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
// Read the stringstream into a memory buffer and pass that to TRT
cudaEngineStream.seekg(0, std::ios::end);
const int modelSize = cudaEngineStream.tellg();
cudaEngineStream.seekg(0, std::ios::beg);
void* modelMem = malloc(modelSize);
if(!modelMem)
throw std::runtime_error("Could not allocate enough memory for load cuda engine file " + trtModelPath);
cudaEngineStream.read((char*)modelMem, modelSize);
this->_engine = runtime->deserializeCudaEngine(modelMem, modelSize);
free(modelMem);
}
void Graph::save(std::string const& fileName)
{
std::ofstream engineSerializedFile;
nvinfer1::IHostMemory* memory = this->_engine->serialize();
if (memory == nullptr)
throw std::runtime_error("Serialize engine failed");
// Open a new file
engineSerializedFile.open(fileName + ".trt", std::ios::out | std::ios::binary);
if (engineSerializedFile.is_open() && engineSerializedFile.good() && !engineSerializedFile.fail()) {
//Save the serialized engine data into the file
engineSerializedFile.write(reinterpret_cast<const char *>(memory->data()), memory->size());
engineSerializedFile.close();
}
else
throw std::runtime_error("Could not save cuda engine file in " + fileName + ".trt");
}
void Graph::initialize()
{
if (!this->_engine) {
nvinfer1::IHostMemory* engineString = this->_builder->buildSerializedNetwork(*(this->_network), *(this->_builderconfig));
if (engineString == nullptr || engineString->size() == 0)
throw std::runtime_error("Failed building serialized engine");
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
this->_engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size());
}
this->_context = this->_engine->createExecutionContext();
// Initialize IO information
initialize_io_descriptors();
}
void Graph::auto_input_profile(std::vector<std::vector<int>> dims_inputs)
{
// TODO: read the optimization profiles and input dimensions from a configuration file (e.g. JSON)
for (int i = 0; i < this->_network->getNbInputs(); ++i) {
nvinfer1::ITensor* input = this->_network->getInput(i);
nvinfer1::Dims dims{};
dims.nbDims = dims_inputs[i].size();
for (unsigned int k = 0; k < dims_inputs[i].size(); ++k) {
dims.d[k] = dims_inputs[i][k];
}
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, dims);
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, dims);
this->_profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, dims);
}
this->_builderconfig->addOptimizationProfile(this->_profile);
}
void Graph::initialize_io_descriptors()
{
this->_iodescriptors.nIO = this->_engine->getNbIOTensors();
for (int nIO = 0; nIO < this->_engine->getNbIOTensors(); ++nIO) {
std::string name = std::string(this->_engine->getIOTensorName(nIO));
nvinfer1::Dims dim = this->_context->getTensorShape(name.c_str());
int nbElements = 1;
for (int j = 0; j < dim.nbDims; ++j) {
    nbElements *= dim.d[j];
}
int sizeBytes = nbElements * dataTypeToSize(this->_engine->getTensorDataType(name.c_str()));
IODesc descriptor {name, nbElements, sizeBytes};
switch (this->_engine->getTensorIOMode(name.c_str())) {
case nvinfer1::TensorIOMode::kINPUT:
this->_iodescriptors.inputs.push_back(descriptor);
break;
case nvinfer1::TensorIOMode::kOUTPUT:
this->_iodescriptors.outputs.push_back(descriptor);
break;
case nvinfer1::TensorIOMode::kNONE:
default:
break;
}
}
}
void Graph::run(void** inputs, void** outputs, ExecutionMode_T mode)
{
switch (mode) {
case SYNC: {
run_sync(inputs, outputs);
break;
}
case ASYNC: {
run_async(inputs, outputs);
break;
}
default:
throw std::runtime_error("Running mode not supported");
}
}
void Graph::run_async(void** inputs, void** outputs)
{
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Check if memory resources have been allocated for inputs and outputs
// If not, allocate memory on device
if (this->_iobuffer.empty()) {
for (unsigned int i = 0; i < nbInputs; ++i) {
void* inputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&inputPtr, this->_iodescriptors.inputs[i].size));
this->_context->setTensorAddress(this->_iodescriptors.inputs[i].name.c_str(), inputPtr);
this->_iobuffer.push_back(inputPtr);
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
void* outputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&outputPtr, this->_iodescriptors.outputs[i].size));
this->_context->setTensorAddress(this->_iodescriptors.outputs[i].name.c_str(), outputPtr);
this->_iobuffer.push_back(outputPtr);
}
}
// Copy inputs to GPU
for (unsigned int i = 0; i < nbInputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(this->_iobuffer[i],
inputs[i],
this->_iodescriptors.inputs[i].size,
cudaMemcpyHostToDevice));
}
// Run inference on GPU
this->_context->enqueueV3(this->_stream);
// Copy outputs to CPU
for (unsigned int i = 0; i < nbOutputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(outputs[i],
this->_iobuffer[i + nbInputs],
this->_iodescriptors.outputs[i].size,
cudaMemcpyDeviceToHost));
}
}
void Graph::run_sync(void** inputs, void** outputs)
{
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Check if memory resources have been allocated for inputs and outputs
// If not, allocate memory on device
if (this->_iobuffer.empty()) {
for (unsigned int i = 0; i < nbInputs; ++i) {
void* inputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&inputPtr, this->_iodescriptors.inputs[i].size));
this->_iobuffer.push_back(inputPtr);
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
void* outputPtr;
CHECK_CUDA_STATUS(cudaMalloc(&outputPtr, this->_iodescriptors.outputs[i].size));
this->_iobuffer.push_back(outputPtr);
}
}
// Copy inputs to GPU
for (unsigned int i = 0; i < nbInputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(this->_iobuffer[i],
inputs[i],
this->_iodescriptors.inputs[i].size,
cudaMemcpyHostToDevice));
}
// Run inference on GPU
this->_context->executeV2(this->_iobuffer.data());
// Copy outputs to CPU
for (unsigned int i = 0; i < nbOutputs; ++i) {
CHECK_CUDA_STATUS(cudaMemcpy(outputs[i],
this->_iobuffer[i + nbInputs],
this->_iodescriptors.outputs[i].size,
cudaMemcpyDeviceToHost));
}
}
void Graph::profile(unsigned int nb_iterations, ExecutionMode_T mode)
{
if(!this->_context) {
throw std::runtime_error(
"Cannot profile the graph without context from engine");
}
unsigned int nbInputs = this->_iodescriptors.inputs.size();
unsigned int nbOutputs = this->_iodescriptors.outputs.size();
// Initialize input buffer on CPU
std::vector<void *> inputs(nbInputs, nullptr);
for (unsigned int i = 0; i < nbInputs; ++i) {
inputs[i] = (void *)new char[this->_iodescriptors.inputs[i].size];
unsigned int nbElts = this->_iodescriptors.inputs[i].size / dataTypeToSize(this->_engine->getTensorDataType(this->_iodescriptors.inputs[i].name.c_str()));
float *pData = (float *)inputs[i];
for (unsigned int j = 0; j < nbElts; ++j) {
pData[j] = float(j);
}
}
// Initialize output buffer on CPU
std::vector<void *> outputs(nbOutputs, nullptr);
for (unsigned int i = 0; i < nbOutputs; ++i) {
outputs[i] = (void *)new char[this->_iodescriptors.outputs[i].size];
}
// Run 1st inference to allocate GPU resources
run(inputs.data(), outputs.data(), mode);
this->_context->setProfiler(&gProfiler);
for (unsigned int i = 0; i < nb_iterations; ++i) {
run(inputs.data(), outputs.data(), mode);
}
double totalProcessTime = 0.0;
for (size_t i = 0; i < gProfiler.mProfile.size(); ++i)
totalProcessTime += gProfiler.mProfile[i].second / nb_iterations;
for (size_t i = 0; i < gProfiler.mProfile.size(); i++)
{
const double processTimeMs = gProfiler.mProfile[i].second / nb_iterations;
const double workLoad = (processTimeMs / totalProcessTime) * 100.0;
std::string barrelLoad(((unsigned int)workLoad + 1) * 2, '*');
std::cout << std::setprecision(10)
<< "(" << std::setfill('0') << std::setw(2)
<< (unsigned int)workLoad << "%) " << barrelLoad
<< " " << gProfiler.mProfile[i].first << ": "
<< processTimeMs << " ms"
<< std::endl;
}
std::cout << "Average profiled tensorRT process time per stimulus = "
<< totalProcessTime << " ms" << std::endl;
for (unsigned int i = 0; i < nbInputs; ++i) {
delete[] (char *)inputs[i];
}
for (unsigned int i = 0; i < nbOutputs; ++i) {
delete[] (char *)outputs[i];
}
}
unsigned int Graph::getNbIO()
{
return this->_iodescriptors.nIO;
}
IOGraphDesc Graph::getIODescription()
{
return this->_iodescriptors;
}
"""Example test file for the TensorRT Python API.
"""
# TODO Update the path to the shared object if needed
import build.lib.aidge_trt as aidge_trt
import numpy as np
if __name__ == '__main__':
model = aidge_trt.Graph("model.onnx")
model.initialize()
# Profile with 10 iterations
model.profile(10)
# Execution example
# img: numpy.array = np.load("PATH TO NPY file")
# output: numpy.array = model.run_sync([img])
#!/usr/bin/env bash
# This script is not meant to be run on its own;
# it is invoked by the Makefile of this export
OBJDIR=build
BINDIR=bin
cmake -B "$OBJDIR" "$@"
cmake --build "$OBJDIR"
# Add write permissions for users
# to clean build and bin folders outside the container
if [ -d "$OBJDIR" ]; then
chmod -R a+w $OBJDIR
fi
if [ -d "$BINDIR" ]; then
chmod -R a+w $BINDIR
fi
#!/usr/bin/env bash
set -Eeuo pipefail
# first arg is `-f` or `--some-option` or there are no args
if [ "$#" -eq 0 ] || [ "${1#-}" != "$1" ]; then
exec bash "$@"
fi
exec "$@"
FROM nvcr.io/nvidia/tensorrt:25.05-py3
# Start bash login shell
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["/bin/bash", "-i"]