Compare revisions — eclipse/aidge/aidge_backend_cuda

Changes are shown as if the source revision was being merged into the target revision: 446 commits on source, 699 additions and 570 deletions.
# common
.cache

# C++ Build
build*/
install*/
include/aidge/backend/cuda_version.h

# VSCode
.vscode

@@ -10,6 +14,9 @@ install*/
__pycache__
*.pyc
*.egg-info
dist*/
wheelhouse/*
_version.py

# Mermaid
*.mmd

@@ -18,4 +25,4 @@ __pycache__
xml*/

# ONNX
*.onnx
###############################################################################
# Aidge Continuous Integration and Deployment                                 #
#                                                                             #
###############################################################################

stages:
  - static_analysis
  - build
  - test
  - coverage
  - release
  - deploy

include:
  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
    file:
      # choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
      # - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
      # - '.gitlab/ci/windows_python.gitlab-ci.yml'
      # - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
build:ubuntu_python:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

test:ubuntu_python:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

coverage:ubuntu_python:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

.build:ubuntu_cpp:template:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

test:ubuntu_cpp:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

coverage:ubuntu_cpp:
  # Use the cudnn image instead of the cuda one
  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

release:pip:ubuntu:
  tags:
    - release:cuda
  variables:
    DOCKER_HOST: unix:///var/run/docker.sock
    CIBW_ENVIRONMENT: >-
      BUILD_WITH_CUDA=1
      AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu'
      AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'
      DOCKER_HOST='unix:///var/run/docker.sock'
      ARCH='x86_64'
      CUDNN_VERSION='9'
      CUDA_MAJOR_VERSION='12'
      CUDA_MINOR_VERSION='8'
      SEARCH_PATH='/home/gitlab-runner/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
      CIBW_REPAIR_WHEEL_COMMAND='auditwheel --verbose repair {wheel} -w {dest_dir} --exclude libcudart.so.12 --exclude libcudnn.so.9 --exclude libcublas.so.12 --exclude libcublasLt.so.12'
  parallel:
    matrix:
      # - CIBW_BUILD: "cp38-manylinux_x86_64"
      # - CIBW_BUILD: "cp39-manylinux_x86_64"
      - CIBW_BUILD: "cp310-manylinux_x86_64"
      - CIBW_BUILD: "cp311-manylinux_x86_64"
      - CIBW_BUILD: "cp312-manylinux_x86_64"
      - CIBW_BUILD: "cp313-manylinux_x86_64"
  before_script:
    # retrieve aidge dependencies
    - DEPENDENCY_JOB="build:ubuntu_python"
    - !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml
  script:
    # - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
    - cibuildwheel --output-dir wheelhouse
  # after_script:
  #   # Ensure all files are owned by the correct user at the end of the job
  #   # Note: sudo requires the job to have privileged = true
  #   - sudo chown -R $(whoami):$(whoami) .
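The release job above drives cibuildwheel entirely through environment variables. As a rough local equivalent, here is a sketch (not the CI's exact invocation; it assumes cibuildwheel and Docker are installed, and picks one cell of the build matrix — the variable values are copied from the job):

```python
# Sketch: reproduce one cell of the release matrix locally.
import os
import subprocess

os.environ["CIBW_BUILD"] = "cp310-manylinux_x86_64"  # one matrix entry
os.environ["CIBW_ENVIRONMENT"] = (
    "BUILD_WITH_CUDA=1 "
    "AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu' "
    "AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'"
)
subprocess.run(["cibuildwheel", "--output-dir", "wheelhouse"], check=True)
```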
################################################################################
# Centralized definitions of common job parameter values.                     #
# Parameters with many optional configurations may be in separate files.      #
#                                                                             #
################################################################################
variables:
  GIT_SUBMODULE_STRATEGY: recursive
  OMP_NUM_THREADS: 4
  GIT_SSL_NO_VERIFY: 1
  DEBIAN_FRONTEND: noninteractive

# See https://docs.gitlab.com/ee/ci/yaml/workflow.html#switch-between-branch-pipelines-and-merge-request-pipelines
workflow:
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS
      when: never
    - if: $CI_COMMIT_BRANCH

default:
  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
  before_script:
    - apt update
    - apt install -y cmake cppcheck python-is-python3 pip git gcovr unzip curl
    - apt install -y libcudnn8-dev

include:
  - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
build:ubuntu_cpp:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_cpp"
    # aidge_core
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    # Build current module
    - export CMAKE_PREFIX_PATH=../install_cpp
    - mkdir -p build_cpp
    - cd build_cpp
    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
    - make -j4 all install
  artifacts:
    expire_in: 1 week
    paths:
      - build_cpp/
      - install_cpp/
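Outside the runner, the job's build phase boils down to a plain configure-and-make sequence. A minimal local sketch (assuming aidge_core and aidge_backend_cpu are already installed under ../install_cpp):

```python
# Sketch of the job's build steps, runnable from the repository root.
import os
import subprocess

os.makedirs("build_cpp", exist_ok=True)
env = dict(os.environ, CMAKE_PREFIX_PATH="../install_cpp")
subprocess.run(["cmake", "-DCMAKE_INSTALL_PREFIX:PATH=../install_cpp",
                "-DCMAKE_BUILD_TYPE=Debug", "-DWERROR=ON", "-DCOVERAGE=ON", ".."],
               cwd="build_cpp", env=env, check=True)
subprocess.run(["make", "-j4", "all", "install"], cwd="build_cpp", check=True)
```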
build:ubuntu_cpp_g++10:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_cpp"
    # aidge_core
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    # Build current module
    - export CMAKE_PREFIX_PATH=../install_cpp
    - apt install -y g++-10
    - mkdir -p build_cpp
    - mkdir -p install_cpp
    - cd build_cpp
    - export CXX=/usr/bin/g++-10
    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
    - make -j4 all install
build:ubuntu_cpp_g++12:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_cpp"
    # aidge_core
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    # Build current module
    - export CMAKE_PREFIX_PATH=../install_cpp
    - apt install -y g++-12
    - mkdir -p build_cpp
    - mkdir -p install_cpp
    - cd build_cpp
    - export CXX=/usr/bin/g++-12
    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
    - make -j4 all install
build:ubuntu_cpp_clang12:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_cpp"
    # aidge_core
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    # Build current module
    - export CMAKE_PREFIX_PATH=../install_cpp
    - apt install -y clang-12
    - mkdir -p build_cpp
    - mkdir -p install_cpp
    - cd build_cpp
    - export CXX=/usr/bin/clang++-12
    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
    - make -j4 all install
build:ubuntu_cpp_clang15:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_cpp"
    # aidge_core
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    # Build current module
    - export CMAKE_PREFIX_PATH=../install_cpp
    - apt install -y clang-15
    - mkdir -p build_cpp
    - mkdir -p install_cpp
    - cd build_cpp
    - export CXX=/usr/bin/clang++-15
    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
    - make -j4 all install
build:ubuntu_python:
  stage: build
  needs: []
  tags:
    - docker
  script:
    # Download dependencies
    - DEPENDENCY_JOB="build:ubuntu_python"
    # aidge_core (python)
    - DEPENDENCY_NAME="aidge_core"
    - !reference [.download_dependency, script]
    # aidge_backend_cpu (python)
    - DEPENDENCY_NAME="aidge_backend_cpu"
    - !reference [.download_dependency, script]
    - python3 -m pip install virtualenv
    - virtualenv venv
    - source venv/bin/activate
    - python3 -m pip install -r requirements.txt
    - python3 -m pip install .
  artifacts:
    expire_in: 1 week
    paths:
      - venv/
# build:windows_cpp:
#   stage: build
#   needs: []
#   tags:
#     - windows
#   image: buildtools
#   before_script:
#     # Install Chocolatey
#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
#     # Install dependencies
#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
#     - choco install git -Y
#     - choco install python -Y
#     - choco install cuda -Y
#     # Update PATH
#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#   script:
#     # Download dependencies
#     # aidge_core
#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_cpp" -o build_artifacts.zip'
#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
#     - Remove-Item .\build_cpp\ -Recurse
#     # aidge_backend_cpu
#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:windows_cpp" -o build_artifacts.zip'
#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
#     - Remove-Item .\build_cpp\ -Recurse
#     - $env:CMAKE_PREFIX_PATH = '../install_cpp'
#     - mkdir -p build_cpp
#     - cd build_cpp
#     - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug ..
#     - cmake --build . -j2
#     - cmake --install . --config Debug
#   artifacts:
#     expire_in: 1 week
#     paths:
#       - build_cpp/
#       - install_cpp/

# build:windows_python:
#   stage: build
#   needs: []
#   tags:
#     - windows
#   image: buildtools
#   before_script:
#     # Install Chocolatey
#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
#     # Install dependencies
#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
#     - choco install git -Y
#     - choco install python -Y
#     - choco install cuda -Y
#     # Update PATH
#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#   script:
#     # Download dependencies
#     # aidge_core (Python)
#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_python" -o build_artifacts.zip'
#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
#     # aidge_backend_cpu (Python)
#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:windows_python" -o build_artifacts.zip'
#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
#     - python -m pip install virtualenv
#     - virtualenv venv
#     - venv\Scripts\Activate.ps1
#     - python -m pip install -r requirements.txt
#     - python -m pip install .
#   artifacts:
#     expire_in: 1 week
#     paths:
#       - venv/
$ErrorActionPreference = "Stop"

# Retrieve and clean the dependencies string from the environment variable
$AIDGE_DEPENDENCIES = $env:AIDGE_DEPENDENCIES -split ' '
Write-Host "Aidge dependencies : $AIDGE_DEPENDENCIES"
if ($($AIDGE_DEPENDENCIES.Length) -eq 0) {
    Write-Host "- No dependencies provided for current repository"
    New-Item -ItemType Directory -Force -Path ".\build" | Out-Null
    Remove-Item -Path ".\build\*" -Recurse -Force
} else {
    Write-Host "Retrieving given dependencies to build current package : $AIDGE_DEPENDENCIES"
    foreach ($dep in $($AIDGE_DEPENDENCIES -split " ")) {
        Write-Host "Retrieving : $dep"
        $curr_loc = $(Get-Location)
        Set-Location ../$dep
        Get-Location
        Get-ChildItem .
        New-Item -Path ".\build" -ItemType Directory -Force | Out-Null
        Get-ChildItem -Path ".\build" -File | Remove-Item -Force
        python -m pip install . -v
        Set-Location $curr_loc
    }
}
#!/bin/bash
set -e
if [[ "$1" == "" ]]; then
    echo "Build aidge dependencies in the cibuildwheel container before building the wheel."
    echo "The search path defines where the dependencies will be searched."
    echo "Hint: in wheel containers, files are mounted on /host by default."
    echo ""
    echo "usage: ./cibuildwheel_build_deps_before_build_wheel.sh search_path"
fi
set -x
if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_core
    mkdir -p build # create the build folder if it is not already there to hold the C++ build
    rm -rf build/* # build from scratch
else
    for repo in $AIDGE_DEPENDENCIES; do # case for other projects
        search_path=$1
        REPO_PATH=$(find $search_path ! -writable -prune -o -type d \
            -name "$repo" \
            -not -path "*/install/*" \
            -not -path "*/.git/*" \
            -not -path "*/.mypy_cache/*" \
            -not -path "*/miniconda/*" \
            -not -path "*/conda/*" \
            -not -path "*/.local/*" \
            -not -path "*/lib/*" \
            -not -path "*/$repo/$repo/*" \
            -not -path "*/proc/*" \
            -print -quit)
        if [[ -z "$REPO_PATH" ]]; then
            echo "ERROR: dependency $repo not found in search_path \"$search_path\". ABORTING."
            exit 1
        fi
        cd $REPO_PATH
        mkdir -p build # create the build folder if it is not already there to hold the C++ build
        rm -rf build/* # build from scratch
        pip install . -v
        # Give all rights on the generated build folder to avoid root-owned files once out of the Docker container
        chmod -R a+rwX build/
        chmod -R a+rwX *.egg-info/
        cd -
    done
fi
set +x
set +e
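In CI this helper runs inside the cibuildwheel container before the wheel build. A hedged sketch of how it might be invoked (the script name comes from its own usage message above; the search path value mirrors the release job's SEARCH_PATH variable, with /host as the default cibuildwheel mount):

```python
# Sketch: invoke the dependency-build helper with an explicit search path.
import os
import subprocess

search_path = os.environ.get("SEARCH_PATH", "/host")  # assumption: /host mount
subprocess.run(["bash", "cibuildwheel_build_deps_before_build_wheel.sh", search_path],
               check=True)
```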
coverage:ubuntu_cpp:
  stage: coverage
  needs: ["build:ubuntu_cpp"]
  tags:
    - docker
  script:
    - cd build_cpp
    - ctest --output-on-failure
    # HTML report for visualization
    - gcovr --html-details --exclude-unreachable-branches -o coverage.html --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
    # Cobertura XML report for Gitlab integration
    - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
  coverage: /^\s*lines:\s*\d+.\d+\%/
  artifacts:
    name: ${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-${CI_COMMIT_SHA}
    expire_in: 2 days
    reports:
      coverage_report:
        coverage_format: cobertura
        path: build_cpp/coverage.xml
coverage:ubuntu_python:
  stage: coverage
  needs: ["build:ubuntu_python"]
  tags:
    - docker
  script:
    - source venv/bin/activate
    - python3 -m pip install numpy coverage
    - cd ${CI_PROJECT_NAME}
    # Retrieve the installation path of the module, since it is installed with pip.
    - export MODULE_LOCATION=`python -c "import ${CI_PROJECT_NAME} as _; print(_.__path__[0])"`
    - python3 -m coverage run --source=$MODULE_LOCATION -m unittest discover -s unit_tests/ -v -b
    - python3 -m coverage report
    - python3 -m coverage xml
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: ${CI_PROJECT_NAME}/coverage.xml
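The MODULE_LOCATION indirection exists because coverage must point at the pip-installed copy of the package, not the source tree. What the job's `python -c` one-liner does, written out:

```python
# Resolve where pip installed the package, so that
# `coverage run --source=...` measures that copy.
import aidge_backend_cuda as m

module_location = m.__path__[0]
print(module_location)
```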
test:ubuntu_cpp:
  stage: test
  needs: ["build:ubuntu_cpp"]
  tags:
    - docker
  script:
    - cd build_cpp
    - ctest --output-junit ctest-results.xml --output-on-failure
  artifacts:
    reports:
      junit: build_cpp/ctest-results.xml

test:ubuntu_python:
  stage: test
  needs: ["build:ubuntu_python"]
  tags:
    - docker
  script:
    - source venv/bin/activate
    - cd ${CI_PROJECT_NAME}
    - python3 -m pip install numpy unittest-xml-reporting
    - python3 -m pip list
    # Run on discovery all tests located in core/unit_tests/python
    - python3 -m xmlrunner discover -s unit_tests/ -v -b --output-file xmlrunner-results.xml
  artifacts:
    reports:
      junit: ${CI_PROJECT_NAME}/xmlrunner-results.xml
# test:windows_cpp:
#   stage: test
#   needs: ["build:windows_cpp"]
#   tags:
#     - windows
#   image: buildtools
#   before_script:
#     # Install Chocolatey
#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
#     # Install dependencies
#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
#     - choco install python -Y
#     # Update PATH
#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#   script:
#     - cd build_cpp
#     - ctest --output-junit ctest-results.xml --output-on-failure
#   artifacts:
#     reports:
#       junit: build_cpp/ctest-results.xml
# Version 0.5.0 (January 31, 2025)

# Version 0.4.0 (December 6, 2024)

# Version 0.1.0 (January 23, 2024)

Initial release
# CMake >= 3.18 is required for good support of FindCUDAToolkit
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)

# Parse version.txt to retrieve Major, Minor and Patch
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version})
set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})

project(aidge_backend_cuda
        VERSION ${version}
        DESCRIPTION "CUDA implementations of the operators of aidge framework."
        LANGUAGES CXX)

message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
message(STATUS "Project version: ${version}")

execute_process(
    COMMAND git rev-parse --short HEAD
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_COMMIT_HASH
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET
)
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")

# Define a preprocessor macro with the Git commit version
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")

# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name

##############################################
# Define options
option(PYBIND "python binding (Default: OFF)" OFF)
option(WERROR "Warning as error (Default: OFF)" OFF)
option(TEST "Enable tests (Default: ON)" ON)
option(COVERAGE "Enable coverage (Default: OFF)" OFF)
option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...) (Default: OFF)" OFF)

##############################################
# Import utils CMakeLists
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")

if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
    Include(CodeCoverage)
endif()

##############################################
# Find system dependencies
##############################################
# FIND AIDGE Dependencies
if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
    set(CMAKE_INSTALL_PREFIX $ENV{AIDGE_INSTALL})
    list(APPEND CMAKE_PREFIX_PATH $ENV{AIDGE_INSTALL})
    message(WARNING "Env var AIDGE_INSTALL detected : $ENV{AIDGE_INSTALL}. Set CMAKE_INSTALL_PREFIX to AIDGE_INSTALL & added to CMAKE_PREFIX_PATH"
            "\n\tCMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}"
            "\n\tCMAKE_PREFIX_PATH = ${CMAKE_PREFIX_PATH}")
endif()

find_package(aidge_core REQUIRED)
if(TEST)
    find_package(aidge_backend_cpu REQUIRED)
endif()

##########
# CUDA
if(NOT $ENV{CIBUILDWHEEL} STREQUAL "")
    message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job."
            "\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH variables")
    list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64)
    list(APPEND ENV{PATH} /usr/local/cuda/bin)
    set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
endif()
find_package(CUDAToolkit 12 REQUIRED)

if(NOT DEFINED CMAKE_CUDA_STANDARD)
    set(CMAKE_CUDA_STANDARD 14)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif()
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES native)
endif()
message(STATUS "Cuda compiler version = ${CMAKE_CUDA_COMPILER_VERSION}")
# Define a preprocessor macro with the Cuda compiler version
add_definitions(-DCUDA_COMPILER_VERSION="${CMAKE_CUDA_COMPILER_VERSION}")

message(STATUS "CUDA STANDARD : ${CMAKE_CUDA_STANDARD}")
message(STATUS "CUDA ARCHITECTURE : ${CMAKE_CUDA_ARCHITECTURES}")

enable_language(CUDA)

##############################################
# Create target and set properties
file(GLOB_RECURSE src_files "src/*.cpp" "src/*.cu")
file(GLOB_RECURSE inc_files "include/*.hpp")

add_library(${module_name} ${src_files} ${inc_files})

# PYTHON BINDING
if (PYBIND)
    # Handles Python + pybind11 headers dependencies
    include(PybindModuleCreation)
    # creates a target of the same name as CMAKE_PROJECT_NAME
    generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.

    target_link_libraries(${module_name}
        PUBLIC
            pybind11::pybind11
        PRIVATE
            Python::Module
    )
endif()

message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h")
# Generate version.h file from config file version.h.in
configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h"
)

target_link_libraries(${module_name}
    PUBLIC
        _aidge_core # _ is added because we link the target not the project
    PRIVATE
        CUDA::cublas
        cudnn
        CUDA::cudart
)

if(${ENABLE_ASAN})
    message("Building ${module_name} with ASAN.")
    set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer)
    target_link_libraries(${module_name}
        PUBLIC
            -fsanitize=address
    )
    target_compile_options(${module_name}
        PRIVATE
            ${SANITIZE_FLAGS}
    )
endif()

if(TEST)
    target_link_libraries(${module_name}
        PUBLIC
            _aidge_backend_cpu # _ is added because we link the target not the project
    )
endif()

#Set target properties
target_include_directories(${module_name}
    PUBLIC
@@ -62,27 +161,9 @@ target_include_directories(${module_name}
        ${CMAKE_CURRENT_SOURCE_DIR}/src
)

set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(${module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

target_compile_features(${module_name} PRIVATE cxx_std_14)

target_compile_options(${module_name} PRIVATE
@@ -101,11 +182,10 @@ endif()

##############################################
# Installation instructions
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})

install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
@@ -116,42 +196,44 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

#Export the targets to a script
install(EXPORT ${CMAKE_PROJECT_NAME}-targets
    FILE "${CMAKE_PROJECT_NAME}-targets.cmake"
    DESTINATION ${INSTALL_CONFIGDIR}
    # COMPONENT ${module_name}
)

#Create a ConfigVersion.cmake file
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
    VERSION ${version}
    COMPATIBILITY AnyNewerVersion
)

configure_package_config_file("${CMAKE_PROJECT_NAME}-config.cmake.in"
    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
    INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

#Install the config, configversion and custom find modules
install(FILES
    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
    DESTINATION ${INSTALL_CONFIGDIR}
)

##############################################
## Exporting from the build tree
export(EXPORT ${CMAKE_PROJECT_NAME}-targets
    FILE "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-targets.cmake")

##############################################
## Add test
if(TEST)
    if(PYBIND)
        message(FATAL_ERROR "PYBIND and TEST are both enabled. But cannot compile with catch_2.\nChoose between pybind and Catch2 for compilation.")
    endif()
    enable_testing()
    add_subdirectory(unit_tests)
endif()
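Given the options above, and the TEST/PYBIND exclusion enforced at the end, a typical out-of-source configure step might look like this sketch (the option values are illustrative):

```python
# Sketch: configure a C++ build with tests; PYBIND stays OFF because the
# CMakeLists above rejects PYBIND=ON together with TEST=ON.
import os
import subprocess

os.makedirs("build", exist_ok=True)
subprocess.run(["cmake", "-DTEST=ON", "-DPYBIND=OFF",
                "-DCMAKE_BUILD_TYPE=Release", ".."],
               cwd="build", check=True)
subprocess.run(["cmake", "--build", ".", "-j4"], cwd="build", check=True)
```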
include README.md LICENCE
recursive-include aidge_backend_cuda *.py
recursive-exclude aidge_backend_cuda/unit_tests *.py
recursive-include include *.hpp
recursive-include src *.cpp
recursive-include python_binding *.cpp
include CMakeLists.txt
@@ -3,15 +3,30 @@
# Aidge CUDA library

You can find in this folder the library that implements the CUDA operators.

[TOC]

## Installation

### Dependencies
- [CUDA ToolKit 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive)
- [cuDNN 9](https://developer.nvidia.com/cudnn-downloads): make sure to install the CUDA 12 compatible version
- `GCC`
- `Make`/`Ninja`
- `CMake`
- `Python` (optional, if you do not intend to use this library from Python through pybind)

#### Aidge dependencies
- `aidge_core`
- `aidge_backend_cpu`

### Pip installation

``` bash
pip install . -v
```

> **TIPS:** Use environment variables to change compilation options (a sketch follows below):
> - `AIDGE_INSTALL`: to set the installation folder. Defaults to `/usr/local/lib`. :warning: This path must be identical to the aidge_core install path.
> - `AIDGE_PYTHON_BUILD_TYPE`: to set the compilation mode to **Debug** or **Release**
> - `AIDGE_BUILD_GEN`: to set the build generator used by CMake (e.g. `Ninja`)
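For instance, a minimal sketch of a release-mode install driven by these variables (the values are illustrative):

```python
# Sketch: drive `pip install` with the environment variables documented above.
import os
import subprocess

os.environ["AIDGE_PYTHON_BUILD_TYPE"] = "Release"
os.environ["AIDGE_BUILD_GEN"] = "Ninja"
subprocess.run(["pip", "install", ".", "-v"], check=True)
```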
## Standard C++ Compilation
...
from aidge_backend_cuda.aidge_backend_cuda import *  # import so generated by PyBind
from . import benchmark
import time

import numpy as np

import aidge_core
import aidge_backend_cuda


def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
    # update model and inputs backend
    model.set_backend("cuda")
    ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
    for ordered_input in ordered_inputs:
        ordered_input.set_backend("cuda")

    scheduler = aidge_core.SequentialScheduler(model)
    scheduler.generate_scheduling()

    timings = []
    # Warm-up runs.
    for i in range(nb_warmup + nb_iterations):
        if i < nb_warmup:
            scheduler.forward(forward_dims=False, data=ordered_inputs)
        else:
            start = time.process_time()
            scheduler.forward(forward_dims=False, data=ordered_inputs)
            end = time.process_time()
            timings.append(end - start)
    return timings


def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
    # update model and inputs backend
    model.set_backend("cuda", device=1)
    ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
    for ordered_input in ordered_inputs:
        ordered_input.set_backend("cuda", device=1)

    scheduler = aidge_core.SequentialScheduler(model)
    scheduler.generate_scheduling()
    scheduler.forward(forward_dims=False, data=ordered_inputs)

    outs = []
    for pair in model.get_ordered_outputs():
        t = pair[0].get_operator().get_output(pair[1])
        t.set_backend("cpu")
        outs.append(t)
    return [np.array(out) for out in outs]
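A hedged usage sketch for these helpers; `model` is assumed to be an already-built `aidge_core.GraphView` (for example, imported from ONNX), and the input name and shape are illustrative:

```python
# Sketch: benchmark a graph on the cuda backend with the helpers above.
import numpy as np
from aidge_backend_cuda import benchmark

x = np.random.rand(1, 3, 224, 224).astype(np.float32)
timings = benchmark.measure_inference_time(model, [("input", x)])
print(f"mean forward time: {1e3 * sum(timings) / len(timings):.2f} ms")
outputs = benchmark.compute_output(model, [("input", x)])
```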
@@ -6,15 +6,17 @@ import numpy as np

class test_tensor(unittest.TestCase):
    """Test tensor binding"""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_getavailable_backends(self):
        self.assertTrue("cuda" in aidge_core.Tensor.get_available_backends())

if __name__ == "__main__":
    unittest.main()
function(generate_python_binding pybind_module_name target_to_bind)
    add_definitions(-DPYBIND)
    Include(FetchContent)

    set(PYBIND_VERSION v2.13.6)
    set(PYBIND11_FINDPYTHON ON)
    message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")

    FetchContent_Declare(
        PyBind11
        GIT_REPOSITORY https://github.com/pybind/pybind11.git
        GIT_TAG ${PYBIND_VERSION} # or a later release
    )

    # Use the new FindPython mode, recommended. Requires CMake 3.15+
    find_package(Python COMPONENTS Interpreter Development.Module)
    FetchContent_MakeAvailable(PyBind11)

    message(STATUS "Creating binding for module ${pybind_module_name}")
    file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")

    pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO_EXTRAS required for pip install
    target_include_directories(${pybind_module_name} PRIVATE "python_binding")
    target_link_libraries(${pybind_module_name}
        PRIVATE
            ${target_to_bind}
            CUDA::cublas
            cudnn
            CUDA::cudart
    )
    set_property(TARGET ${pybind_module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
    set_target_properties(${pybind_module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
    set_target_properties(${pybind_module_name} PROPERTIES
        CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,ALL"
    )
    set_target_properties(${pybind_module_name} PROPERTIES INSTALL_RPATH "")
endfunction()
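A quick sanity check (sketch) that the module produced by this function is importable and actually registers the backend, mirroring the unit test earlier in this diff:

```python
# Sketch: verify that the pybind module built by generate_python_binding
# loads and registers the "cuda" backend.
import aidge_core
import aidge_backend_cuda  # loads the generated native module

assert "cuda" in aidge_core.Tensor.get_available_backends()
```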
@@ -11,9 +11,42 @@
#ifndef AIDGE_BACKEND_CUDA_IMPORTS_H_
#define AIDGE_BACKEND_CUDA_IMPORTS_H_

#include "aidge/backend/cuda_version.h"

#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/OperatorImpl.hpp"
#include "aidge/backend/cuda/operator/AbsImpl.hpp"
#include "aidge/backend/cuda/operator/AddImpl.hpp"
#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/operator/ClipImpl.hpp"
#include "aidge/backend/cuda/operator/DivImpl.hpp"
#include "aidge/backend/cuda/operator/EqualImpl.hpp"
#include "aidge/backend/cuda/operator/ErfImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
#include "aidge/backend/cuda/operator/LRNImpl.hpp"
#include "aidge/backend/cuda/operator/LnImpl.hpp"
#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/MatMulImpl.hpp"
#include "aidge/backend/cuda/operator/MulImpl.hpp"
#include "aidge/backend/cuda/operator/PadImpl.hpp"
#include "aidge/backend/cuda/operator/PowImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
#include "aidge/backend/cuda/operator/RoundImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cuda/operator/SqrtImpl.hpp"
#include "aidge/backend/cuda/operator/SubImpl.hpp"
#include "aidge/backend/cuda/operator/TanhImpl.hpp"

#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
#ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
#define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_

#include <cstddef>  // std::size_t
#include <memory>
#include <string>
#include <vector>

#include <cuda.h>

#include <type_traits>  // std::enable_if, std::is_same

#include "aidge/backend/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
@@ -13,30 +21,33 @@
namespace Aidge {

template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);
template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
template <>
void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);

/**
 * @brief Abstract class for the TensorImpl_cuda class template.
 * @details Its purpose is to provide access to base methods that are specific
 * to the implementation (which are therefore not present in the TensorImpl
 * class), but whose data type does not need to be known.
 */
class TensorImpl_cuda_ {
protected:
    mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;

public:
    /**
     * @brief Return the CuDNN tensor descriptor of the tensor.
     * @details This method uses lazy initialization for the descriptor
     * (which is therefore mutable in the derived class).
     * @return cudnnTensorDescriptor_t CuDNN tensor descriptor.
     */
    virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0;

    virtual ~TensorImpl_cuda_() {
        if (mCudnnTensor != nullptr)
            cudnnDestroyTensorDescriptor(mCudnnTensor);
    }
};

template <class T>
@@ -54,151 +65,186 @@ private:
    }

private:
    /// Pointer to the data and its capacity
    future_std::span<T> mData;
    /// If this instance own the data, std::unique_ptr manages it
    std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner;

public:
    static const std::string Backend;

    TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {}

    bool operator==(const TensorImpl &otherImpl) const override final;

    static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) {
        return std::make_shared<TensorImpl_cuda<T>>(device, dims);
    }

    // native interface
    const future_std::span<T>& data() const { return mData; }

    inline std::size_t capacity() const noexcept override { return mData.size(); }

    std::size_t scalarSize() const noexcept override { return sizeof(T); }

    void zeros() override final {
        CHECK_CUDA_STATUS(cudaMemset(rawPtr(), T(0), mNbElts * sizeof(T)));
    }

    void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        const T* srcT = static_cast<const T *>(src);
        T* dstT = static_cast<T *>(rawPtr(offset));
        AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", typeid(T).name());
        CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice));
    }

    void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override {
        if (length == 0) {
            return;
        }

        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        switch (srcDt) {
        case DataType::Float64:
            thrust_copy(static_cast<const double*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Float32:
            thrust_copy(static_cast<const float*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Float16:
            thrust_copy(static_cast<const half_float::half*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Int64:
            thrust_copy(static_cast<const int64_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::UInt64:
            thrust_copy(static_cast<const uint64_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Int32:
            thrust_copy(static_cast<const int32_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::UInt32:
            thrust_copy(static_cast<const uint32_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Int16:
            thrust_copy(static_cast<const int16_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::UInt16:
            thrust_copy(static_cast<const uint16_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::Int8:
            thrust_copy(static_cast<const int8_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        case DataType::UInt8:
            thrust_copy(static_cast<const uint8_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
                        static_cast<size_t>(length));
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt);
            break;
        }
    }

    void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
    }

    void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
    }

    void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
    }

    void *rawPtr(NbElts_t offset = 0) override {
        cudaSetDevice(mDevice);
        lazyInit();
        return (mData.data() + offset);
    };

    const void *rawPtr(NbElts_t offset = 0) const override {
        AIDGE_ASSERT(mData.size() >= mNbElts, "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized const rawPtr", typeid(T).name());
        return (mData.data() + offset);
    };

    const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override {
        if (mCudnnTensor == nullptr) {
            CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor));

            if (tensor.size() > 0) {
                /**
                **  cuDNN tensors are restricted to having at least 4 dimensions:
                **  when working with lower dimensional data, unused dimensions are set to 1.
                **  Refer to the cudnnSetTensorNdDescriptor documentation from:
                **  https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html
                **/
                std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
                std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());

                if (dims.size() < 4) {
                    dims.resize(4, 1);
                    strides.resize(4, 1);
                }

                CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
                                                              CudaContext::data_type<T>::value,
                                                              dims.size(),
                                                              &dims[0],
                                                              &strides[0]));
            }
        }
        else {
            // Compare if the shape of the tensor has changed
            cudnnDataType_t currentDataType;
            int currentNbDims;
            // Since we don't know the nb dims of the current tensor, we init with CUDNN_DIM_MAX then remove the trailing zeros
            std::vector<int> currentDims(CUDNN_DIM_MAX);
            std::vector<int> currentStrides(CUDNN_DIM_MAX);
            CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, &currentDataType, &currentNbDims, currentDims.data(), currentStrides.data()));
            // Remove the trailing zeros
            currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(),
                              currentDims.end());

            std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
            if (dims.size() < 4) {
                dims.resize(4, 1);
            }

            // Update descriptor if shape has changed
            if (dims != currentDims) {
                std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());
                if (strides.size() < 4) {
                    strides.resize(4, 1);
                }
                CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
@@ -208,42 +254,42 @@ public:
                                                              &strides[0]));
            }
        }

        return mCudnnTensor;
    }

    void setRawPtr(void *ptr, NbElts_t length) override final {
        AIDGE_ASSERT(length >= mNbElts, "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer (length: {}) of insufficient capacity (required: {})", typeid(T).name(), length, mNbElts);
        mData = future_std::span<T>(static_cast<T *>(ptr), length);
        mDataOwner.reset();
    };

    virtual ~TensorImpl_cuda() = default;

private:
    void lazyInit() {
        if (mData.size() < mNbElts) {
            // Need more data, a re-allocation will occur
            AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "TensorImpl_cuda<{}>: trying to enlarge non-owned data", typeid(T).name());
            mDataOwner.reset(cudaAlloc(mNbElts));
            mData = future_std::span<T>(mDataOwner.get(), mNbElts);
        }
    }
};

template <typename T>
const std::string TensorImpl_cuda<T>::Backend = "cuda";

REGISTRAR(Tensor, {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
REGISTRAR(Tensor, {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
REGISTRAR(Tensor, {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int64}, Aidge::TensorImpl_cuda<int64_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int16}, Aidge::TensorImpl_cuda<int16_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int8}, Aidge::TensorImpl_cuda<int8_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt64}, Aidge::TensorImpl_cuda<uint64_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt32}, Aidge::TensorImpl_cuda<uint32_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt16}, Aidge::TensorImpl_cuda<uint16_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt8}, Aidge::TensorImpl_cuda<uint8_t>::create);
}  // namespace Aidge

#endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */
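From Python, these REGISTRAR entries are what make the cuda backend selectable for each listed dtype. A small usage sketch, mirroring the benchmark helpers earlier in this diff:

```python
# Sketch: round-trip a tensor through the cuda backend registered above.
import numpy as np
import aidge_core
import aidge_backend_cuda  # importing registers the "cuda" TensorImpl entries

t = aidge_core.Tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
t.set_backend("cuda")  # device copy via TensorImpl_cuda<float>'s copyFromHost
t.set_backend("cpu")   # back to host via copyToHost
print(np.array(t))
```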
/********************************************************************************
 * Copyright (c) 2024 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_

#include <array>
#include <memory>
#include <tuple>
#include <vector>

#include <cudnn.h>

#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Abs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"

#include "aidge/backend/cuda/utils/CudaUtils.hpp"

namespace Aidge {
// Operator implementation entry point for the backend
class AbsImpl_cuda : public OperatorImpl {
public:
    AbsImpl_cuda(const Abs_Op& op) : OperatorImpl(op, "cuda") {}

    static std::unique_ptr<AbsImpl_cuda> create(const Abs_Op& op) {
        return std::make_unique<AbsImpl_cuda>(op);
    }

    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},
            {DataType::Float16},
        };
    }

    void forward() override;
    void backward() override;

private:
    std::shared_ptr<Tensor> mInputFallback;
    std::shared_ptr<Tensor> mOutputGradFallback;

    template <class T> void forward_(const Tensor& input);
    template <class T> void backward_(const Tensor& input, const Tensor& outputGrad);
};

// Implementation entry point registration to Operator
REGISTRAR(Abs_Op, "cuda", Aidge::AbsImpl_cuda::create);
}  // namespace Aidge

#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ */
/********************************************************************************
 * Copyright (c) 2024 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#ifndef AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_

#include <stdexcept>
#include <cfloat>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>

#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"

namespace Aidge {

template <class T>
void absForward(const T* input, T* output, int size);

}

#endif /* AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ */