
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (193)
Showing with 461 additions and 81 deletions
@@ -4,6 +4,7 @@
# C++ Build
build*/
install*/
include/aidge/backend/cuda_version.h
# VSCode
.vscode
@@ -12,23 +12,46 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
# - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
# - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
build:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
test:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
coverage:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
.build:ubuntu_cpp:template:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
test:ubuntu_cpp:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
coverage:ubuntu_cpp:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
release:pip:ubuntu:
tags:
- release:cuda
variables:
DOCKER_HOST: unix:///var/run/docker.sock
@@ -36,19 +59,22 @@ release:pip:ubuntu:
BUILD_WITH_CUDA=1
AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu'
AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'
CUDA_TOOLKIT_VERSION='11-8'
DOCKER_HOST='unix:///var/run/docker.sock'
ARCH='x86_64'
CUDNN_VERSION='9'
-CUDA_MAJOR_VERSION='11'
+CUDA_MAJOR_VERSION='12'
CUDA_MINOR_VERSION='8'
-SEARCH_PATH='/home/ubuntu/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
+SEARCH_PATH='/home/gitlab-runner/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
CIBW_REPAIR_WHEEL_COMMAND='auditwheel --verbose repair {wheel} -w {dest_dir} --exclude libcudart.so.12 --exclude libcudnn.so.9 --exclude libcublas.so.12 --exclude libcublasLt.so.12'
parallel:
matrix:
- CIBW_BUILD: "cp38-manylinux_x86_64"
- CIBW_BUILD: "cp39-manylinux_x86_64"
# - CIBW_BUILD: "cp38-manylinux_x86_64"
# - CIBW_BUILD: "cp39-manylinux_x86_64"
- CIBW_BUILD: "cp310-manylinux_x86_64"
- CIBW_BUILD: "cp311-manylinux_x86_64"
- CIBW_BUILD: "cp312-manylinux_x86_64"
- CIBW_BUILD: "cp313-manylinux_x86_64"
before_script:
# retrieve aidge dependencies
@@ -56,9 +82,11 @@ release:pip:ubuntu:
- !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml
script:
-- /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
+# - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
+- cibuildwheel --output-dir wheelhouse
-after_script:
-# Ensure all files are owned by the correct user at the end of the job
-- sudo chown -R $(whoami):$(whoami) .
+# after_script:
+# # Ensure all files are owned by the correct user at the end of the job
+# # Note: sudo requires the job to have privileged = true
+# - sudo chown -R $(whoami):$(whoami) .
#!/bin/bash
set -e
if [[ "$1" == "" ]]; then
if [[ "$1" == "" ]]; then
echo "build aidge deps in cibuildwheel container before building wheel."
echo "search path defines where the dependencies will be searched."
echo "Hint : In wheel containers, files are mounted on /host by default."
@@ -10,13 +10,14 @@ set -x
if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_core
mkdir -p build # create build/ if it is not already there to hold the C++ build
rm -rf build/* # build from scratch
else
for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
search_path=$1
REPO_PATH=$(find $search_path ! -writable -prune -o -type d \
-name "$repo" \
-not -path "*/install/*" \
-not -path "*/.git/*" \
-not -path "*/.mypy_cache/*" \
-not -path "*/miniconda/*" \
-not -path "*/conda/*" \
-not -path "*/.local/*" \
@@ -24,7 +25,7 @@ else
-not -path "*/$repo/$repo/*" \
-not -path "*/proc/*" \
-print -quit)
if [[ -z "$REPO_PATH" ]]; then
echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
exit -1
fi
@@ -33,6 +34,10 @@ else
mkdir -p build # create build/ if it is not already there to hold the C++ build
rm -rf build/* # build from scratch
pip install . -v
# Give all rights on the generated build folder to avoid root-permission issues once out of the Docker container
chmod -R a+rwX build/
chmod -R a+rwX *.egg-info/
cd -
done
fi
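For context, a hypothetical invocation of this dependency-build helper inside a cibuildwheel container follows; the script name and mount point are illustrative assumptions, not taken from this diff:

# Hypothetical usage sketch: the first argument is the search path for dependency checkouts.
# Inside cibuildwheel containers the host filesystem is mounted under /host by default,
# so a runner build directory would be reachable as e.g. /host/home/gitlab-runner/builds.
bash build_deps_before_wheel.sh /host/home/gitlab-runner/builds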
# Version 0.5.0 (January 31, 2025)
# Version 0.4.0 (December 6, 2024)
# Version 0.1.0 (January 23, 2024)
Initial release
# CMake >= 3.18 is required for good support of FindCUDAToolkit
cmake_minimum_required(VERSION 3.18)
-set(CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
# Parse version.txt to retrieve Major, Minor and Patch
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version})
set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
project(aidge_backend_cuda
VERSION ${version}
DESCRIPTION "CUDA implementations of the operators of the Aidge framework."
@@ -21,18 +30,17 @@ execute_process(
)
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
# Define a preprocessor macro with the Git commit version
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
##############################################
# Define options
-option(PYBIND "python binding" OFF)
-option(WERROR "Warning as error" OFF)
-option(TEST "Enable tests" ON)
-option(COVERAGE "Enable coverage" OFF)
-option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF)
+option(PYBIND "python binding (Default: OFF)" OFF)
+option(WERROR "Warning as error (Default: OFF)" OFF)
+option(TEST "Enable tests (Default: ON)" ON)
+option(COVERAGE "Enable coverage (Default: OFF)" OFF)
+option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...) (Default: OFF)" OFF)
##############################################
# Import utils CMakeLists
@@ -60,14 +68,14 @@ endif()
##########
# CUDA
-if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
+if(NOT $ENV{CIBUILDWHEEL} STREQUAL "")
message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job."
"\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH Variables")
list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64)
list(APPEND ENV{PATH} /usr/local/cuda/bin)
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
endif()
-find_package(CUDAToolkit REQUIRED)
+find_package(CUDAToolkit 12 REQUIRED)
if(NOT DEFINED CMAKE_CUDA_STANDARD)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -97,7 +105,7 @@ if (PYBIND)
# Handles Python + pybind11 headers dependencies
include(PybindModuleCreation)
# creates a target of the same name as CMAKE_PROJECT_NAME
generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.
target_link_libraries(${module_name}
PUBLIC
@@ -107,12 +115,21 @@
)
endif()
message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h")
# Generate version.h file from config file version.h.in
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h"
)
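The version.h.in template itself is not part of this diff; a minimal, hypothetical sketch of such a template (macro names assumed, not confirmed by the source) shows how configure_file() substitutes the version values parsed from version.txt:

// Hypothetical sketch of include/aidge/backend/version.h.in.
// configure_file() replaces each @...@ placeholder with the corresponding CMake variable.
#ifndef AIDGE_BACKEND_CUDA_VERSION_H_
#define AIDGE_BACKEND_CUDA_VERSION_H_
#define AIDGE_BACKEND_CUDA_VERSION "@PROJECT_VERSION@"
#define AIDGE_BACKEND_CUDA_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define AIDGE_BACKEND_CUDA_VERSION_MINOR @PROJECT_VERSION_MINOR@
#define AIDGE_BACKEND_CUDA_VERSION_PATCH @PROJECT_VERSION_PATCH@
#endif /* AIDGE_BACKEND_CUDA_VERSION_H_ */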
target_link_libraries(${module_name}
PUBLIC
_aidge_core # _ is added because we link the target not the project
CUDA::cudart
PRIVATE
CUDA::cublas
cudnn
CUDA::cudart
)
if( ${ENABLE_ASAN} )
@@ -8,6 +8,8 @@ You can find in this folder the library that implements the CUDA operators.
## Installation
### Dependencies
- [CUDA Toolkit 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive)
- [cuDNN 9](https://developer.nvidia.com/cudnn-downloads) (make sure to install the CUDA 12 compatible version)
- `GCC`
- `Make`/`Ninja`
- `CMake`
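With these dependencies installed, a standard out-of-source build could look like the following sketch (default options assumed, not taken from this changeset; adapt generator and prefix to your setup):

# Sketch of a typical CMake build, assuming CUDA 12.4 and cuDNN 9 are installed system-wide.
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build -j
cmake --install build --prefix install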
from aidge_backend_cuda.aidge_backend_cuda import * # import the shared object generated by PyBind
from ._version import *
from . import benchmark
import time
import numpy as np
import aidge_core
import aidge_backend_cuda
def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# update model and inputs backend
model.set_backend("cuda")
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cuda")
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
timings = []
# Warm-up runs followed by timed iterations.
for i in range(nb_warmup + nb_iterations):
if i < nb_warmup:
scheduler.forward(forward_dims=False, data=ordered_inputs)
else:
start = time.process_time()
scheduler.forward(forward_dims=False, data=ordered_inputs)
end = time.process_time()
timings.append((end - start))
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
# update model and inputs backend
model.set_backend("cuda", device = 1)
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cuda", device = 1)
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
scheduler.forward(forward_dims=False, data=ordered_inputs)
outs = []
for pair in model.get_ordered_outputs():
t = pair[0].get_operator().get_output(pair[1])
t.set_backend("cpu")
outs.append(t)
return [np.array(out) for out in outs]
\ No newline at end of file
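A hypothetical usage sketch for this benchmark module follows; the model construction is elided because it depends on how the GraphView is obtained (e.g. from an ONNX import), and the input name and shape are illustrative assumptions:

# Hypothetical usage sketch of the helpers defined above.
import numpy as np
from aidge_backend_cuda import benchmark

model = ...  # an aidge_core.GraphView obtained elsewhere, e.g. from an ONNX import
# Each entry pairs an input name with a NumPy array.
input_data = [("input0", np.random.rand(1, 3, 224, 224).astype(np.float32))]
timings = benchmark.measure_inference_time(model, input_data, nb_warmup=10, nb_iterations=50)
print(f"median inference time: {sorted(timings)[len(timings) // 2] * 1e3:.3f} ms")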
function(generate_python_binding pybind_module_name target_to_bind)
add_definitions(-DPYBIND)
Include(FetchContent)
-set(PYBIND_VERSION v2.10.4)
+set(PYBIND_VERSION v2.13.6)
set(PYBIND11_FINDPYTHON ON)
message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
@@ -20,6 +20,18 @@ function(generate_python_binding pybind_module_name target_to_bind)
file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO_EXTRAS required for pip install
-target_include_directories(${pybind_module_name} PUBLIC "python_binding")
-target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind})
+target_include_directories(${pybind_module_name} PRIVATE "python_binding")
+target_link_libraries(${pybind_module_name}
+    PRIVATE
+        ${target_to_bind}
+        CUDA::cublas
+        cudnn
+        CUDA::cudart
+)
set_property(TARGET ${pybind_module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(${pybind_module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${pybind_module_name} PROPERTIES
    LINK_FLAGS "-Wl,--exclude-libs,ALL"
)
set_target_properties(${pybind_module_name} PROPERTIES INSTALL_RPATH "")
endfunction()
@@ -11,35 +11,43 @@
#ifndef AIDGE_BACKEND_CUDA_IMPORTS_H_
#define AIDGE_BACKEND_CUDA_IMPORTS_H_
#include "aidge/backend/cuda_version.h"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/OperatorImpl.hpp"
#include "aidge/backend/cuda/operator/AbsImpl.hpp"
#include "aidge/backend/cuda/operator/AddImpl.hpp"
#include "aidge/backend/cuda/operator/AndImpl.hpp"
#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
#include "aidge/backend/cuda/operator/BitShiftImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/operator/ClipImpl.hpp"
#include "aidge/backend/cuda/operator/DivImpl.hpp"
#include "aidge/backend/cuda/operator/EqualImpl.hpp"
#include "aidge/backend/cuda/operator/ErfImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
#include "aidge/backend/cuda/operator/LRNImpl.hpp"
#include "aidge/backend/cuda/operator/LnImpl.hpp"
#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/MatMulImpl.hpp"
#include "aidge/backend/cuda/operator/MulImpl.hpp"
#include "aidge/backend/cuda/operator/PadImpl.hpp"
#include "aidge/backend/cuda/operator/PowImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/RoundImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
#include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cuda/operator/SqrtImpl.hpp"
#include "aidge/backend/cuda/operator/SubImpl.hpp"
#include "aidge/backend/cuda/operator/TanhImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
@@ -4,6 +4,10 @@
#include <cstddef> // std::size_t
#include <memory>
#include <string>
#include <vector>
#include <cuda.h>
#include <type_traits> // std::enable_if, std::is_same
#include "aidge/backend/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"
@@ -17,14 +21,9 @@
namespace Aidge {
template <typename SRC_T, typename DST_T>
-void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
-template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
-void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
-template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
-void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
-template <>
-void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);
+void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);
/**
* @brief Abstract class for the TensorImpl_cuda class template.
@@ -94,7 +93,7 @@ public:
}
void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
const T* srcT = static_cast<const T *>(src);
T* dstT = static_cast<T *>(rawPtr(offset));
@@ -107,62 +106,62 @@ public:
return;
}
-AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
switch (srcDt) {
case DataType::Float64:
thrust_copy(static_cast<const double*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Float32:
thrust_copy(static_cast<const float*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Float16:
thrust_copy(static_cast<const half_float::half*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Int64:
thrust_copy(static_cast<const int64_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::UInt64:
thrust_copy(static_cast<const uint64_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Int32:
thrust_copy(static_cast<const int32_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::UInt32:
thrust_copy(static_cast<const uint32_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Int16:
thrust_copy(static_cast<const int16_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::UInt16:
thrust_copy(static_cast<const uint16_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::Int8:
thrust_copy(static_cast<const int8_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
case DataType::UInt8:
thrust_copy(static_cast<const uint8_t*>(src),
static_cast<T*>(rawPtr(offset)),
-length);
+static_cast<size_t>(length));
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt);
@@ -171,21 +170,22 @@ public:
}
void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
-AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
}
void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
-AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
+AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
}
void *rawPtr(NbElts_t offset = 0) override {
cudaSetDevice(mDevice);
lazyInit();
return (mData.data() + offset);
};
@@ -279,16 +279,17 @@ private:
template <typename T>
const std::string TensorImpl_cuda<T>::Backend = "cuda";
-namespace {
-static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
-    {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
-    {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
-    {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
-static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
-    {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
-} // namespace
+REGISTRAR(Tensor, {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int64}, Aidge::TensorImpl_cuda<int64_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int16}, Aidge::TensorImpl_cuda<int16_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::Int8}, Aidge::TensorImpl_cuda<int8_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt64}, Aidge::TensorImpl_cuda<uint64_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt32}, Aidge::TensorImpl_cuda<uint32_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt16}, Aidge::TensorImpl_cuda<uint16_t>::create);
+REGISTRAR(Tensor, {"cuda", DataType::UInt8}, Aidge::TensorImpl_cuda<uint8_t>::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Abs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class AbsImpl_cuda : public OperatorImpl {
public:
AbsImpl_cuda(const Abs_Op& op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<AbsImpl_cuda> create(const Abs_Op& op) {
return std::make_unique<AbsImpl_cuda>(op);
}
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
{DataType::Float16},
};
}
void forward() override;
private:
std::shared_ptr<Tensor> mInputFallback;
std::shared_ptr<Tensor> mOutputGradFallback;
template <class T> void forward_(const Tensor& input);
};
// Implementation entry point registration to Operator
REGISTRAR(Abs_Op, "cuda", Aidge::AbsImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class T>
void absForward(const T* input, T* output, int size);
}
#endif /* AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ */
@@ -36,7 +36,7 @@ public:
return std::make_unique<AddImpl_cuda>(op);
}
-virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
@@ -46,10 +46,18 @@ public:
void forward() override;
void backward() override;
~AddImpl_cuda();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
std::vector<cudnnTensorDescriptor_t> mTensorDesc;
cudnnReduceTensorDescriptor_t mBwdReduceDesc = nullptr;
size_t mBwdWorkspaceSize = 0;
void* mBwdWorkspace = nullptr;
std::vector<std::shared_ptr<Tensor>> mInputFallbacks;
std::shared_ptr<Tensor> mOutputGradFallback;
template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs);
template <class T> void backward_(const Tensor& outGrad);
};
// Implementation entry point registration to Operator
@@ -36,7 +36,7 @@ public:
return std::make_unique<ArgMaxImpl_cuda>(op);
}
-virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
@@ -37,7 +37,7 @@ public:
return std::make_unique<AvgPoolingImpl_cuda>(op);
}
-virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
@@ -37,7 +37,7 @@ public:
return std::make_unique<BatchNormImpl_cuda>(op);
}
-virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/BitShift.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class BitShiftImpl_cuda : public OperatorImpl {
public:
BitShiftImpl_cuda(const BitShift_Op& op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<BitShiftImpl_cuda> create(const BitShift_Op& op) {
return std::make_unique<BitShiftImpl_cuda>(op);
}
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Int32},
{DataType::Int64}
};
}
void forward() override;
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
};
// Implementation entry point registration to Operator
REGISTRAR(BitShift_Op, "cuda", Aidge::BitShiftImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_BITSHIFTIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_BITSHIFTIMPL_KERNELS_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/operator/BitShift.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class T>
void bitShiftForward(const BitShift_Op::BitShiftDirection direction,
const bool rounding,
const T* input1, T* output, const T* input2,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
}
#endif /* AIDGE_CUDA_OPERATOR_BITSHIFTIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Clip.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class ClipImpl_cuda : public OperatorImpl {
public:
ClipImpl_cuda(const Clip_Op& op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<ClipImpl_cuda> create(const Clip_Op& op) {
return std::make_unique<ClipImpl_cuda>(op);
}
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
{DataType::Float16},
};
}
void forward() override;
private:
template <class T> void forward_();
};
// Implementation entry point registration to Operator
REGISTRAR(Clip_Op, "cuda", Aidge::ClipImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_ */