Compare revisions

ec78b3aa · 61e04aa3 · 6971fe6c · 2247fece · b41bfda0 · 4cad5349
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 # C++ Build
 build*/
 install*/
+include/aidge/backend/cuda_version.h

 # VSCode
 .vscode

--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,23 +12,46 @@ stages:
  - deploy

 include:
-  - project: 'eclipse/aidge/gitlab_shared_files' 
+  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
-    file: 
+    file:
      # choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'

      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
-      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'   
+      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'

      # - '.gitlab/ci/windows_cpp.gitlab-ci.yml'

-      # - '.gitlab/ci/windows_python.gitlab-ci.yml'   
-      # - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'   
+      # - '.gitlab/ci/windows_python.gitlab-ci.yml'
+      # - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'

+build:ubuntu_python:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+test:ubuntu_python:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+coverage:ubuntu_python:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+.build:ubuntu_cpp:template:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+test:ubuntu_cpp:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+coverage:ubuntu_cpp:
+  # Use cudnn image instead of the cuda one
+  image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

 release:pip:ubuntu:
-  tags: 
+  tags:
    - release:cuda
  variables:
    DOCKER_HOST: unix:///var/run/docker.sock
@@ -36,19 +59,22 @@ release:pip:ubuntu:
      BUILD_WITH_CUDA=1
      AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu'
      AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'
-      CUDA_TOOLKIT_VERSION='11-8'
      DOCKER_HOST='unix:///var/run/docker.sock'
      ARCH='x86_64'
      CUDNN_VERSION='9'
-      CUDA_MAJOR_VERSION='11'
+      CUDA_MAJOR_VERSION='12'
      CUDA_MINOR_VERSION='8'
-      SEARCH_PATH='/home/ubuntu/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
+      SEARCH_PATH='/home/gitlab-runner/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
+      CIBW_REPAIR_WHEEL_COMMAND='auditwheel --verbose repair {wheel} -w {dest_dir} --exclude libcudart.so.12 --exclude libcudnn.so.9 --exclude libcublas.so.12 --exclude libcublasLt.so.12'

  parallel:
    matrix:
-      - CIBW_BUILD: "cp38-manylinux_x86_64"
-      - CIBW_BUILD: "cp39-manylinux_x86_64"
+      # - CIBW_BUILD: "cp38-manylinux_x86_64"
+      # - CIBW_BUILD: "cp39-manylinux_x86_64"
      - CIBW_BUILD: "cp310-manylinux_x86_64"
+      - CIBW_BUILD: "cp311-manylinux_x86_64"
+      - CIBW_BUILD: "cp312-manylinux_x86_64"
+      - CIBW_BUILD: "cp313-manylinux_x86_64"

  before_script:
    # retrieve aidge dependencies
@@ -56,9 +82,11 @@ release:pip:ubuntu:
    - !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml

  script:
-    - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
+    # - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
+    - cibuildwheel --output-dir wheelhouse

-  after_script:
-    # Ensure all files are owned by the correct user at the end of the job
-    - sudo chown -R $(whoami):$(whoami) .
+  # after_script:
+  #   # Ensure all files are owned by the correct user at the end of the job
+  #   # Note: sudo requires the job to have privileged = true
+  #   - sudo chown -R $(whoami):$(whoami) .

--- a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
+++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
 #!/bin/bash
 set -e
-if [[ "$1" == "" ]]; then 
+if [[ "$1" == "" ]]; then
  echo "build aidge deps in cibuildwheel container before building wheel."
  echo "search path defines where the dependencies will be searched."
  echo "Hint : In wheel containers, files are mounted on /host by default."
@@ -10,13 +10,14 @@ set -x
 if [[ $AIDGE_DEPENDENCIES ==  "" ]]; then # case for aidge_ core
  mkdir -p build # creating build if its not already there to hold the build of cpp files
  rm -rf build/* # build from scratch
-else 
+else
  for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
    search_path=$1
    REPO_PATH=$(find $search_path ! -writable -prune -o  -type d     \
                                    -name "$repo"                    \
                                    -not -path "*/install/*"         \
                                    -not -path "*/.git/*"            \
+                                    -not -path "*/.mypy_cache/*"     \
                                    -not -path "*/miniconda/*"       \
                                    -not -path "*/conda/*"           \
                                    -not -path "*/.local/*"          \
@@ -24,7 +25,7 @@ else
                                    -not -path "*/$repo/$repo/*"     \
                                    -not -path "*/proc/*"            \
                                    -print -quit)
-    if [[ -z "$REPO_PATH" ]]; then 
+    if [[ -z "$REPO_PATH" ]]; then
      echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
      exit -1
    fi
@@ -33,6 +34,10 @@ else
    mkdir -p build # creating build if its not already there to hold the build of cpp files
    rm -rf build/* # build from scratch
    pip install . -v
+
+    # Give all rights on generated build folder to avoid root issues once out of the Docker
+    chmod -R a+rwX build/
+    chmod -R a+rwX *.egg-info/
    cd -
  done
 fi

--- a/CHANGELOG
+++ b/CHANGELOG
+# Version 0.5.0 (January 31, 2024)
+
+# Version 0.4.0 (December 6, 2024)
+
 # Version 0.1.0 (January 23, 2024)

 Initial release
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 # CMake >= 3.18 is required for good support of FindCUDAToolkit
 cmake_minimum_required(VERSION 3.18)
-set(CXX_STANDARD 14)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS  OFF)

 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)

+# Parse version.txt to retrieve Major, Minor and Path
+string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" _ MATCHES ${version})
+set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
+set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
+set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
+
 project(aidge_backend_cuda
        VERSION ${version}
        DESCRIPTION "CUDA implementations of the operators of aidge framework."
@@ -21,18 +30,17 @@ execute_process(
 )
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
 # Define a preprocessor macro with the Git commit version
-add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")

 # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
 set(module_name _${CMAKE_PROJECT_NAME}) # target name

 ##############################################
 # Define options
-option(PYBIND "python binding" OFF)
-option(WERROR "Warning as error" OFF)
-option(TEST "Enable tests" ON)
-option(COVERAGE "Enable coverage" OFF)
-option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF)
+option(PYBIND "python binding (Default: OFF)" OFF)
+option(WERROR "Warning as error (Default: OFF)" OFF)
+option(TEST "Enable tests (Default: ON)" ON)
+option(COVERAGE "Enable coverage (Default: OFF)" OFF)
+option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...) (Default: OFF)" OFF)

 ##############################################
 # Import utils CMakeLists
@@ -60,14 +68,14 @@ endif()

 ##########
 # CUDA
-if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
+if(NOT $ENV{CIBUILDWHEEL} STREQUAL "")
    message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job."
                    "\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH Variables")
    list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64)
    list(APPEND ENV{PATH} /usr/local/cuda/bin)
    set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
 endif()
-find_package(CUDAToolkit REQUIRED)
+find_package(CUDAToolkit 12 REQUIRED)
 if(NOT DEFINED CMAKE_CUDA_STANDARD)
    set(CMAKE_CUDA_STANDARD 14)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -97,7 +105,7 @@ if (PYBIND)
    # Handles Python + pybind11 headers dependencies
    include(PybindModuleCreation)
    # creates a target of the same name as CMAKE_PROJECT_NAME
-        generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.
+    generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.

    target_link_libraries(${module_name}
        PUBLIC
@@ -107,12 +115,21 @@ if (PYBIND)
        )
 endif()

+message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h")
+# Generate version.h file from config file version.h.in
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h"
+)
+
+
 target_link_libraries(${module_name}
    PUBLIC
        _aidge_core # _ is added because we link the target not the project
-        CUDA::cudart
+    PRIVATE
        CUDA::cublas
        cudnn
+        CUDA::cudart
 )

 if( ${ENABLE_ASAN} )

--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ You can find in this folder the library that implements the CUDA operators.
 ## Installation

 ### Dependencies
+- [CUDA ToolKit 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive)
+- [CUDnn9](https://developer.nvidia.com/cudnn-downloads) make sure to install the CUDA 12 compatible version
 - `GCC`
 - `Make`/`Ninja`
 - `CMake`

--- a/aidge_backend_cuda/__init__.py
+++ b/aidge_backend_cuda/__init__.py
 from aidge_backend_cuda.aidge_backend_cuda import *  # import so generated by PyBind
-from ._version import *
+from . import benchmark
--- a/aidge_backend_cuda/benchmark.py
+++ b/aidge_backend_cuda/benchmark.py
+import time
+
+import numpy as np
+
+import aidge_core
+import aidge_backend_cuda
+
+def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
+    # update model and inputs backend
+    model.set_backend("cuda")
+    ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
+    for ordered_input in ordered_inputs:
+        ordered_input.set_backend("cuda")
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    timings = []
+    # Warm-up runs.
+    for i in range(nb_warmup + nb_iterations):
+        if i < nb_warmup:
+            scheduler.forward(forward_dims=False, data=ordered_inputs)
+        else:
+            start = time.process_time()
+            scheduler.forward(forward_dims=False, data=ordered_inputs)
+            end = time.process_time()
+            timings.append((end - start))
+    return timings
+
+def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
+    # update model and inputs backend
+    model.set_backend("cuda", device = 1)
+    ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
+    for ordered_input in ordered_inputs:
+        ordered_input.set_backend("cuda", device = 1)
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    scheduler.forward(forward_dims=False, data=ordered_inputs)
+    outs = []
+    for pair in model.get_ordered_outputs():
+        t = pair[0].get_operator().get_output(pair[1])
+        t.set_backend("cpu")
+        outs.append(t)
+
+    return [np.array(out) for out in outs]
\ No newline at end of file
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
-function(generate_python_binding pybind_module_name target_to_bind) 
+function(generate_python_binding pybind_module_name target_to_bind)
    add_definitions(-DPYBIND)
    Include(FetchContent)

-    set(PYBIND_VERSION v2.10.4)
+    set(PYBIND_VERSION v2.13.6)
    set(PYBIND11_FINDPYTHON ON)
    message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")

@@ -20,6 +20,18 @@ function(generate_python_binding pybind_module_name target_to_bind)
    file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")

    pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
-    target_include_directories(${pybind_module_name} PUBLIC "python_binding")
-    target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind})
+    target_include_directories(${pybind_module_name} PRIVATE "python_binding")
+    target_link_libraries(${pybind_module_name}
+        PRIVATE
+            ${target_to_bind}
+            CUDA::cublas
+            cudnn
+            CUDA::cudart
+    )
+    set_property(TARGET ${pybind_module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
+    set_target_properties(${pybind_module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    set_target_properties(${pybind_module_name} PROPERTIES
+        CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,ALL"
+    )
+    set_target_properties(${module} PROPERTIES INSTALL_RPATH "")
 endfunction()
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -11,35 +11,44 @@

 #ifndef AIDGE_BACKEND_CUDA_IMPORTS_H_
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_
+#include "aidge/backend/cuda_version.h"

 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+
+#include "aidge/backend/cuda/operator/OperatorImpl.hpp"
+
+#include "aidge/backend/cuda/operator/AbsImpl.hpp"
 #include "aidge/backend/cuda/operator/AddImpl.hpp"
-#include "aidge/backend/cuda/operator/AndImpl.hpp"
 #include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
 #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
+#include "aidge/backend/cuda/operator/BitShiftImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
+#include "aidge/backend/cuda/operator/ClipImpl.hpp"
 #include "aidge/backend/cuda/operator/DivImpl.hpp"
+#include "aidge/backend/cuda/operator/EqualImpl.hpp"
+#include "aidge/backend/cuda/operator/ErfImpl.hpp"
 #include "aidge/backend/cuda/operator/FCImpl.hpp"
 #include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
+#include "aidge/backend/cuda/operator/LRNImpl.hpp"
 #include "aidge/backend/cuda/operator/LnImpl.hpp"
 #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/MatMulImpl.hpp"
 #include "aidge/backend/cuda/operator/MulImpl.hpp"
 #include "aidge/backend/cuda/operator/PadImpl.hpp"
 #include "aidge/backend/cuda/operator/PowImpl.hpp"
 #include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
 #include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
 #include "aidge/backend/cuda/operator/ReLUImpl.hpp"
-#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
+#include "aidge/backend/cuda/operator/ResizeImpl.hpp"
+#include "aidge/backend/cuda/operator/RoundImpl.hpp"
 #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
-#include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
 #include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
+#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp"
+#include "aidge/backend/cuda/operator/SqrtImpl.hpp"
 #include "aidge/backend/cuda/operator/SubImpl.hpp"
 #include "aidge/backend/cuda/operator/TanhImpl.hpp"

-#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
-#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
-#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
-
-
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
--- a/include/aidge/backend/cuda/data/Interpolation.cuh
+++ b/include/aidge/backend/cuda/data/Interpolation.cuh
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+#ifndef AIDGE_CUDA_DATA_INTERPOLATION_H_
+#define AIDGE_CUDA_DATA_INTERPOLATION_H_
+
+#include "aidge/data/Interpolation.hpp"
+namespace Aidge {
+namespace InterpolationCUDA {
+
+    /**
+     * @brief Computes the approximate input coordinates corresponding to given output coordinates,
+     *        according to the specified coordinate transformation mode.
+     *
+     * This function performs an "untransform" step for spatial interpolation, mapping output-space
+     * integer coordinates (`coordOut`) back into approximate input-space floating-point coordinates
+     * (`coordInApprox`) based on the transformation strategy provided.
+     *
+     * @param coordOut        Pointer to the output coordinates (integer values).
+     * @param inputDims       Pointer to the dimensions of the input tensor.
+     * @param outputDims      Pointer to the dimensions of the output tensor.
+     * @param coordTransfoMode The coordinate transformation mode (e.g., AlignCorners, HalfPixel, etc.).
+     * @param roi             Pointer to region of interest values, used only for TFCropAndResize mode;
+     *                        expected to be of size 2 * rank.
+     * @param coordInApprox   Pointer to the output array where the approximate input coordinates will be stored.
+     * @param rank            The dimensionality of the spatial domain (e.g., 2 for 2D, 3 for 3D).
+     */
+    __device__ void untransformCoordinates(
+        const int* coordOut,
+        const int* inputDims,
+        const int* outputDims,
+        Aidge::Interpolation::CoordinateTransformation coordTransfoMode,
+        const float* roi,
+        float* coordInApprox,
+        int rank);
+
+    /**
+     * @brief Retrieves neighboring input tensor values around a set of continuous coordinates for interpolation.
+     *
+     * This function gathers neighboring values from a multidimensional tensor for use in interpolation,
+     * depending on the interpolation mode (e.g., linear or cubic), scaling, padding strategy, and
+     * anti-aliasing settings. The gathered values and their coordinates are stored in output buffers.
+     *
+     * @tparam T               The data type of the input and output tensor values.
+     * @param tensorValues     Pointer to the input tensor values (flattened array).
+     * @param tensorDims       Pointer to the dimensions of the input tensor.
+     * @param coords           Pointer to the floating-point coordinates in the input space.
+     * @param scales           Pointer to scaling factors per dimension.
+     * @param rank             The number of spatial dimensions in the tensor.
+     * @param mode             Interpolation mode (e.g., Linear, Cubic).
+     * @param paddingMode      Padding mode used for out-of-bound accesses (e.g., Zero, Edge).
+     * @param antialiasing     Whether antialiasing is enabled (affects kernel footprint).
+     * @param outValues        Pointer to the buffer where retrieved neighbor values will be stored.
+     * @param outCoords        Pointer to the buffer where the corresponding coordinates of neighbors will be stored.
+     *                         Output shape is [maxNeighbours x rank].
+     * @param outCount         Pointer to a single integer where the number of valid neighbors will be written.
+     * @param maxNeighbours    Maximum number of neighbors to retrieve (capacity of the output buffers).
+     */
+    template <typename T>
+    __device__ void retrieveNeighboursKernel(
+        const T* tensorValues,
+        const int* tensorDims,
+        float* coords,
+        const float* scales,
+        int rank,
+        Aidge::Interpolation::Mode mode,
+        Aidge::PadBorderType paddingMode,
+        bool antialiasing,
+        T* outValues,
+        int* outCoords,
+        int* outCount,
+        int maxNeighbours
+    );
+
+    /**
+     * @brief Performs N-dimensional linear interpolation given neighbor points and their values.
+     *
+     * This function computes a weighted average of neighbor values based on linear interpolation
+     * weights derived from their distance to a target coordinate. It optionally applies antialiasing
+     * scaling to the distances. The weights are normalized to ensure smooth interpolation.
+     *
+     * @tparam T                   The data type of the input and output values.
+     * @param coordToInterpolate  Pointer to the floating-point coordinate to interpolate at (length = rank).
+     * @param scales              Pointer to scale factors per dimension.
+     * @param pointsCoords        Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
+     * @param pointValues         Pointer to values corresponding to neighbor coordinates.
+     * @param coordsNbr           Number of neighboring points used in the interpolation.
+     * @param rank                Number of spatial dimensions.
+     * @param antialiasing        Whether to apply antialiasing by scaling coordinate deltas.
+     * @return T                  The interpolated value at the target coordinate.
+     */
+    template <typename T>
+    __device__ T interpolateLinear(
+        const float* coordToInterpolate,
+        const float* scales,
+        const int* pointsCoords,
+        const T* pointValues,
+        int coordsNbr,
+        int rank,
+        bool antialiasing
+    );
+
+    /**
+     * @brief Performs N-dimensional cubic interpolation given neighbor points and their values.
+     *
+     * This function computes a weighted average of neighbor values based on cubic interpolation weights,
+     * optionally applying antialiasing and excluding neighbors outside input dimensions.
+     * Static dimensions (with size 1 and scale 1) are ignored in the weight calculation.
+     *
+     * @tparam T                   The data type of the input and output values.
+     * @param coordToInterpolate  Pointer to the floating-point coordinate to interpolate at (length = rank).
+     * @param scales              Pointer to scale factors per dimension.
+     * @param pointsCoords        Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
+     * @param pointValues         Pointer to values corresponding to neighbor coordinates.
+     * @param coordsNbr           Number of neighboring points used in the interpolation.
+     * @param rank                Number of spatial dimensions.
+     * @param inputDims           Pointer to input tensor dimensions.
+     * @param a                   Cubic interpolation parameter (often -0.75 for Catmull-Rom).
+     * @param antialiasing        Whether to apply antialiasing in weight calculation.
+     * @param excludeOutside      Whether to exclude neighbors outside input dimensions.
+     * @return T                  The interpolated value at the target coordinate.
+     */
+    template <typename T>
+    __device__ T interpolateCubic(
+        float* coordToInterpolate,
+        const float* scales,
+        int* pointsCoords,
+        T* pointValues,
+        int coordsNbr,
+        int rank,
+        const int* inputDims,
+        float a,
+        bool antialiasing,
+        bool excludeOutside
+    );
+
+    /**
+     * @brief Dispatches to the appropriate interpolation method (linear or cubic) based on the mode.
+     *
+     * This function selects the interpolation method and computes the interpolated value
+     * at the given coordinate using either cubic or linear interpolation. Returns zero
+     * for unsupported interpolation modes.
+     *
+     * @tparam T                   The data type of the input and output values.
+     * @param coordToInterpolate  Pointer to the floating-point coordinate to interpolate at (length = rank).
+     * @param scales              Pointer to scale factors per dimension.
+     * @param pointsCoords        Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
+     * @param pointValues         Pointer to values corresponding to neighbor coordinates.
+     * @param coordsNbr           Number of neighboring points used in the interpolation.
+     * @param rank                Number of spatial dimensions.
+     * @param mode                Interpolation mode (Linear or Cubic).
+     * @param cubicCoeffA         Cubic interpolation coefficient (used only for cubic mode).
+     * @param antialiasing        Whether to apply antialiasing in interpolation.
+     * @param excludeOutside      Whether to exclude neighbors outside input dimensions (only cubic mode).
+     * @param inputDims           Pointer to input tensor dimensions.
+     * @return T                  The interpolated value at the target coordinate.
+     */
+    template <typename T>
+    __device__ T interpolate(float* coordToInterpolate,
+                            const float* scales,
+                            int* pointsCoords,
+                            T* pointValues,
+                            int coordsNbr,
+                            int rank,
+                            Aidge::Interpolation::Mode mode,
+                            float cubicCoeffA,
+                            bool antialiasing,
+                            bool excludeOutside,
+                            const int* inputDims);
+} // namespace InterpolationCUDA
+} // namespace Aidge
+#endif /*AIDGE_CUDA_DATA_INTERPOLATION_H_*/
\ No newline at end of file
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -4,6 +4,10 @@
 #include <cstddef>  // std::size_t
 #include <memory>
 #include <string>
+#include <vector>
+
+#include <cuda.h>
+#include <type_traits>  // std::enable_if, std::is_same

 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
@@ -17,14 +21,9 @@

 namespace Aidge {

+
 template <typename SRC_T, typename DST_T>
-void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
-template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
-void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
-template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
-void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
-template <>
-void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);
+void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);

 /**
 * @brief Abstract class for the TensorImpl_cuda class template.
@@ -94,7 +93,7 @@ public:
    }

    void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        const T* srcT = static_cast<const T *>(src);
        T* dstT = static_cast<T *>(rawPtr(offset));

@@ -107,62 +106,67 @@ public:
            return;
        }

-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        switch (srcDt) {
        case DataType::Float64:
            thrust_copy(static_cast<const double*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Float32:
            thrust_copy(static_cast<const float*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Float16:
            thrust_copy(static_cast<const half_float::half*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Int64:
            thrust_copy(static_cast<const int64_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::UInt64:
            thrust_copy(static_cast<const uint64_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Int32:
            thrust_copy(static_cast<const int32_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::UInt32:
            thrust_copy(static_cast<const uint32_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Int16:
            thrust_copy(static_cast<const int16_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::UInt16:
            thrust_copy(static_cast<const uint16_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::Int8:
            thrust_copy(static_cast<const int8_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
            break;
        case DataType::UInt8:
            thrust_copy(static_cast<const uint8_t*>(src),
                        static_cast<T*>(rawPtr(offset)),
-                        length);
+                        static_cast<size_t>(length));
+            break;
+        case DataType::Boolean:
+            thrust_copy(static_cast<const bool*>(src),
+                        static_cast<T*>(rawPtr(offset)),
+                        static_cast<size_t>(length));
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt);
@@ -171,21 +175,22 @@ public:
    }

    void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
    }

    void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
    }

    void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+        AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
    }

    void *rawPtr(NbElts_t offset = 0) override {
+        cudaSetDevice(mDevice);
        lazyInit();
        return (mData.data() + offset);
    };
@@ -279,16 +284,18 @@ private:
 template <typename T>
 const std::string TensorImpl_cuda<T>::Backend = "cuda";

-namespace {
-static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
-        {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
-        {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
-        {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
-        {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
-}  // namespace
+REGISTRAR(Tensor, {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int64}, Aidge::TensorImpl_cuda<int64_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int16}, Aidge::TensorImpl_cuda<int16_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int8}, Aidge::TensorImpl_cuda<int8_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt64}, Aidge::TensorImpl_cuda<uint64_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt32}, Aidge::TensorImpl_cuda<uint32_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt16}, Aidge::TensorImpl_cuda<uint16_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt8}, Aidge::TensorImpl_cuda<uint8_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Boolean}, Aidge::TensorImpl_cuda<bool>::create);
 }  // namespace Aidge

 #endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */
--- a/include/aidge/backend/cuda/operator/AbsImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AbsImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Abs.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class AbsImpl_cuda : public OperatorImpl {
+public:
+    AbsImpl_cuda(const Abs_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<AbsImpl_cuda> create(const Abs_Op& op) {
+        return std::make_unique<AbsImpl_cuda>(op);
+    }
+
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+
+private:
+
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Abs_Op, "cuda", Aidge::AbsImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ */
--- a/include/aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <class T>
+void absForward(const T* input, T* output, int size);
+
+}
+#endif /* AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ */
+
+
+
+
+
--- a/include/aidge/backend/cuda/operator/AddImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AddImpl.hpp
@@ -36,7 +36,7 @@ public:
        return std::make_unique<AddImpl_cuda>(op);
    }

-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},
@@ -46,10 +46,18 @@ public:

    void forward() override;
    void backward() override;
+    ~AddImpl_cuda();

 private:
-    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
-    template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    std::vector<cudnnTensorDescriptor_t> mTensorDesc;
+    cudnnReduceTensorDescriptor_t mBwdReduceDesc = nullptr;
+    size_t mBwdWorkspaceSize = 0;
+    void* mBwdWorkspace = nullptr;
+    std::vector<std::shared_ptr<Tensor>> mInputFallbacks;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs);
+    template <class T> void backward_(const Tensor& outGrad);
 };

 // Implementation entry point registration to Operator

--- a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp
@@ -36,11 +36,11 @@ public:
        return std::make_unique<ArgMaxImpl_cuda>(op);
    }

-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
-            {DataType::Float64},
-            {DataType::Float32},
-            {DataType::Float16},
+            {DataType::Float64, DataType::Int64},
+            {DataType::Float32, DataType::Int64},
+            {DataType::Float16, DataType::Int64},
        };
    }

@@ -50,7 +50,7 @@ private:
    // CuDNN specific variables
    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;

-    template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
+    template <class I, class O> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
 };

 // Implementation entry point registration to Operator

--- a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp
@@ -23,8 +23,8 @@

 namespace Aidge
 {
-    template <class T>
-    void ArgMax_cuda_forward_kernel(const T* input, T* output,
+    template <class I, class O>
+    void ArgMax_cuda_forward_kernel(const I* input, O* output,
                                    const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
                                    int axis, int total_elems, std::size_t selectLastIdx);
 }

--- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
@@ -37,7 +37,7 @@ public:
        return std::make_unique<AvgPoolingImpl_cuda>(op);
    }

-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},

--- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
+++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
@@ -37,7 +37,7 @@ public:
        return std::make_unique<BatchNormImpl_cuda>(op);
    }

-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},

--- a/include/aidge/backend/cuda/operator/BitShiftImpl.hpp
+++ b/include/aidge/backend/cuda/operator/BitShiftImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class BitShiftImpl_cuda : public OperatorImpl {
+public:
+    BitShiftImpl_cuda(const BitShift_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<BitShiftImpl_cuda> create(const BitShift_Op& op) {
+        return std::make_unique<BitShiftImpl_cuda>(op);
+    }
+
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Int32},
+            {DataType::Int64}
+        };
+    }
+
+    void forward() override;
+
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(BitShift_Op, "cuda", Aidge::BitShiftImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_ */
No results found