diff --git a/.gitignore b/.gitignore
index 9fbfccca6dfda997d8a0dbfc4b373590feeecad8..f3571f3fefd133675c1989b50247a12d107bc685 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# common
+.cache
+
 # C++ Build
 build*/
 install*/
@@ -10,6 +13,9 @@ install*/
 __pycache__
 *.pyc
 *.egg-info
+dist*/
+wheelhouse/*
+_version.py
 
 # Mermaid
 *.mmd
@@ -18,4 +24,4 @@ __pycache__
 xml*/
 
 # ONNX
-*.onnx
\ No newline at end of file
+*.onnx
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 271581e4d845ea93d5fd3a09f471edec69913277..1c2606707d16cd401e575354a1cbb99a11451ff8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,39 +1,64 @@
-################################################################################
-# Pre-configured CI/CD for your Aidge module.
-#
-# Three stages are already pre-configured to run on Eclipse Aidge CI:
-# - build: ubuntu_cpp, ubuntu_python and windows_cpp;
-# - test: ubuntu_cpp, ubuntu_python and windows_cpp;
-# - coverage: ubuntu_cpp and ubuntu_python.
-#
-# If your project is pure C++ or pure Python, you can remove the "_python" or 
-# "_cpp" jobs respectively.
-# "ubuntu" jobs require an Ubuntu runner with a docker executor with tag 
-# "docker".
-# "windows" jobs require a Windows runner with a docker-windows executor with 
-# tag "windows".
-#
-# You can change the docker images in the YML scripts directly. The default 
-# images are:
-# - nvidia/cuda:12.2.0-devel-ubuntu22.04 for Ubuntu jobs;
-# - buildtools for Windows jobs, built on top of 
-#   mcr.microsoft.com/windows/servercore:ltsc2022 with Microsoft Visual Studio 
-#   2022 BuildTools installed.
-#
-# See Aidge project wiki for more details on how to setup your own docker images
-# and Gitlab runners.
-################################################################################
+###############################################################################
+#                Aidge Continuous Integration and Deployment                  #
+#                                                                             #
+###############################################################################
 
 stages:
-  # Build
+  - static_analysis
   - build
-  # Unit test stage
   - test
-  # Code coverage
   - coverage
+  - release
+  - deploy
 
 include:
-  - local: '/.gitlab/ci/_global.gitlab-ci.yml'
-  - local: '/.gitlab/ci/build.gitlab-ci.yml'
-  - local: '/.gitlab/ci/test.gitlab-ci.yml'
-  - local: '/.gitlab/ci/coverage.gitlab-ci.yml'
+  - project: 'eclipse/aidge/gitlab_shared_files' 
+    ref: 'main'
+    file: 
+      # choose which jobs to run by including the corresponding files.
+      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
+
+      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
+      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'   
+
+      # - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
+
+      # - '.gitlab/ci/windows_python.gitlab-ci.yml'   
+      # - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'   
+
+
+release:pip:ubuntu:
+  tags: 
+    - release:cuda
+  variables:
+    DOCKER_HOST: unix:///var/run/docker.sock
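+    # Environment forwarded into the cibuildwheel build containers: AIDGE_INSTALL is read by
+    # CMakeLists.txt, AIDGE_DEPENDENCIES by the before-build scripts in .gitlab/ci/.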
+    CIBW_ENVIRONMENT: >-
+      BUILD_WITH_CUDA=1
+      AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu'
+      AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'
+      CUDA_TOOLKIT_VERSION='11-8'
+      DOCKER_HOST='unix:///var/run/docker.sock'
+      ARCH='x86_64'
+      CUDNN_VERSION='9'
+      CUDA_MAJOR_VERSION='11'
+      CUDA_MINOR_VERSION='8'
+      SEARCH_PATH='/home/ubuntu/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
+
+  parallel:
+    matrix:
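+      # One wheel is built per supported CPython version.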
+      - CIBW_BUILD: "cp38-manylinux_x86_64"
+      - CIBW_BUILD: "cp39-manylinux_x86_64"
+      - CIBW_BUILD: "cp310-manylinux_x86_64"
+
+  before_script:
+    # retrieve aidge dependencies
+    - DEPENDENCY_JOB="build:ubuntu_python"
+    - !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml
+
+  script:
+    - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
+
+  after_script:
+    # Ensure all files are owned by the correct user at the end of the job
+    - sudo chown -R $(whoami):$(whoami) .
+
diff --git a/.gitlab/ci/_global.gitlab-ci.yml b/.gitlab/ci/_global.gitlab-ci.yml
deleted file mode 100644
index ccc83c5d24623f1f00eaa78bc596f0cb1ff429dc..0000000000000000000000000000000000000000
--- a/.gitlab/ci/_global.gitlab-ci.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-################################################################################
-# Centralized definitions of common job parameter values.                      #
-# Parameters with many optional configurations may be in separate files.       #
-#                                                                              #
-################################################################################
-variables:
-  GIT_SUBMODULE_STRATEGY: recursive
-  OMP_NUM_THREADS: 4
-  GIT_SSL_NO_VERIFY: 1
-  DEBIAN_FRONTEND: noninteractive
-
-# See https://docs.gitlab.com/ee/ci/yaml/workflow.html#switch-between-branch-pipelines-and-merge-request-pipelines
-workflow:
-  rules:
-    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
-    - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS
-      when: never
-    - if: $CI_COMMIT_BRANCH
-
-default:
-  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
-  before_script:
-    - apt update
-    - apt install -y cmake cppcheck python-is-python3 pip git gcovr unzip curl
-    - apt install -y libcudnn8-dev
diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml
deleted file mode 100644
index c5d22e779753ee0fbfa0bcbd828f85639ace8b9f..0000000000000000000000000000000000000000
--- a/.gitlab/ci/build.gitlab-ci.yml
+++ /dev/null
@@ -1,242 +0,0 @@
-include:
-  #- remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
-  - remote: 'https://gitlab.eclipse.org/hrouis/gitlab_shared_files/-/raw/test_hro/.gitlab/ci/shared_script.gitlab-ci.yml'
-
-build:ubuntu_cpp:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - mkdir -p build_cpp
-    - cd build_cpp
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - build_cpp/
-      - install_cpp/
-
-build:ubuntu_cpp_g++10:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-        # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y g++-10
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/g++-10
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_g++12:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y g++-12
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/g++-12
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_clang12:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y clang-12
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/clang++-12
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_clang15:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y clang-15
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/clang++-15
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_python:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    - DEPENDENCY_JOB="build:ubuntu_python"
-    # aidge_core (python)
-    - DEPENDENCY_NAME="aidge_core"
-    - !reference [.download_dependency, script]
-    # aidge_backend_cpu (python)
-    - DEPENDENCY_NAME="aidge_backend_cpu"
-    - !reference [.download_dependency, script]
-
-    - python3 -m pip install virtualenv
-    - virtualenv venv
-    - source venv/bin/activate
-    - python3 -m pip install -r requirements.txt
-    - python3 -m pip install .
-
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - venv/
-
-# build:windows_cpp:
-#   stage: build
-#   needs: []
-#   tags:
-#     - windows
-
-#   image: buildtools
-#   before_script:
-#     # Install Chocolatey
-#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-#     # Install dependencies
-#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-#     - choco install git -Y
-#     - choco install python -Y
-#     - choco install cuda -Y
-#     # Update PATH
-#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-#   script:
-#     # Download dependencies
-#     # aidge_core
-#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_cpp" -o build_artifacts.zip'
-#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
-#     - Remove-Item .\build_cpp\ -Recurse
-#     # aidge_backend_cpu
-#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/main/download?job=build:windows_cpp" -o build_artifacts.zip'
-#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
-#     - Remove-Item .\build_cpp\ -Recurse
-
-#     - $env:CMAKE_PREFIX_PATH = '../install_cpp'
-#     - mkdir -p build_cpp
-#     - cd build_cpp
-#     - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug ..
-#     - cmake --build . -j2
-#     - cmake --install . --config Debug
-
-#   artifacts:
-#     expire_in: 1 week
-#     paths:
-#       - build_cpp/
-#       - install_cpp/
-
-# build:windows_python:
-#   stage: build
-#   needs: []
-#   tags:
-#     - windows
-
-#   image: buildtools
-#   before_script:
-#     # Install Chocolatey
-#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-#     # Install dependencies
-#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-#     - choco install git -Y
-#     - choco install python -Y
-#     - choco install cuda -Y
-#     # Update PATH
-#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-#   script:
-#     # Download dependencies
-#     # aidge_core (Python)
-#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_python" -o build_artifacts.zip'
-#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
-#     # aidge_backend_cpu (Python)
-#     - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/main/download?job=build:windows_python" -o build_artifacts.zip'
-#     - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force
-
-#     - python -m pip install virtualenv
-#     - virtualenv venv
-#     - venv\Scripts\Activate.ps1
-#     - python -m pip install -r requirements.txt
-#     - python -m pip install .
-#   artifacts:
-#     expire_in: 1 week
-#     paths:
-#       - venv/
diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1 b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1
new file mode 100644
index 0000000000000000000000000000000000000000..c2715ea5550432838d3cc8692e97204b278d2c85
--- /dev/null
+++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1
@@ -0,0 +1,23 @@
+$ErrorActionPreference = "Stop"
+
+# Retrieve and clean the dependencies string from the environment variable
+$AIDGE_DEPENDENCIES = $env:AIDGE_DEPENDENCIES -split ' '
+Write-Host "Aidge dependencies : $AIDGE_DEPENDENCIES"
+if ([string]::IsNullOrEmpty($env:AIDGE_DEPENDENCIES)) {
+    Write-Host "- No dependencies provided for current repository"
+    New-Item -ItemType Directory -Force -Path ".\build" | Out-Null
+    Remove-Item -Path ".\build\*" -Recurse -Force
+} else {
+    Write-Host "Retrieving dependencies required to build the current package : $AIDGE_DEPENDENCIES"
+    foreach ($dep in $AIDGE_DEPENDENCIES) {
+        Write-Host "Retrieving : $dep"
+        $curr_loc=$(Get-Location)
+        Set-Location ../$dep
+        Get-Location 
+        Get-ChildItem .
+        New-Item -Path ".\build" -ItemType Directory -Force | Out-Null
+        Get-ChildItem -Path ".\build" -File | Remove-Item -Force
+        python -m pip install . -v
+        Set-Location $curr_loc
+    }
+}
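+
+# Note: with AIDGE_DEPENDENCIES="aidge_core aidge_backend_cpu" (as set by the release CI jobs),
+# this script pip-installs each dependency from its sibling checkout before cibuildwheel builds
+# the aidge_backend_cuda wheel.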
diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4f74488ae41714a4ce03ba7514bf93842768c5ae
--- /dev/null
+++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+if [[ "$1" == "" ]]; then 
+  echo "build aidge deps in cibuildwheel container before building wheel."
+  echo "search path defines where the dependencies will be searched."
+  echo "Hint : In wheel containers, files are mounted on /host by default."
+  echo "\nusage : ./cibuildwheel_build_deps_before_build_wheel.sh $search_path"
+fi
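+# Example invocation (illustrative; in the cibuildwheel container the host build directory is
+# mounted under /host): ./cibuildwheel_build_deps_before_build_wheel.sh /host/home/ubuntu/builds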
+set -x
+if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case of aidge_core, which has no Aidge dependencies
+  mkdir -p build # create the build directory (holds the C++ build) if it does not already exist
+  rm -rf build/* # build from scratch
+else 
+  for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
+    search_path=$1
+    REPO_PATH=$(find $search_path ! -writable -prune -o  -type d     \
+                                    -name "$repo"                    \
+                                    -not -path "*/install/*"         \
+                                    -not -path "*/.git/*"            \
+                                    -not -path "*/miniconda/*"       \
+                                    -not -path "*/conda/*"           \
+                                    -not -path "*/.local/*"          \
+                                    -not -path "*/lib/*"             \
+                                    -not -path "*/$repo/$repo/*"     \
+                                    -not -path "*/proc/*"            \
+                                    -print -quit)
+    if [[ -z "$REPO_PATH" ]]; then 
+      echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
+      exit -1
+    fi
+
+    cd $REPO_PATH
+    mkdir -p build # create the build directory (holds the C++ build) if it does not already exist
+    rm -rf build/* # build from scratch
+    pip install . -v
+    cd -
+  done
+fi
+set +x
+set +e
diff --git a/.gitlab/ci/coverage.gitlab-ci.yml b/.gitlab/ci/coverage.gitlab-ci.yml
deleted file mode 100644
index 33547fc3f52771c456fba3d34a6e8d96eebafd8a..0000000000000000000000000000000000000000
--- a/.gitlab/ci/coverage.gitlab-ci.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-coverage:ubuntu_cpp:
-  stage: coverage
-  needs: ["build:ubuntu_cpp"]
-  tags:
-    - docker
-  script:
-    - cd build_cpp
-    - ctest --output-on-failure
-    # HTML report for visualization
-    - gcovr --html-details --exclude-unreachable-branches -o coverage.html --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
-    # Coberta XML report for Gitlab integration
-    - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
-  coverage: /^\s*lines:\s*\d+.\d+\%/
-  artifacts:
-    name: ${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-${CI_COMMIT_SHA}
-    expire_in: 2 days
-    reports:
-      coverage_report:
-        coverage_format: cobertura
-        path: build_cpp/coverage.xml
-
-coverage:ubuntu_python:
-  stage: coverage
-  needs: ["build:ubuntu_python"]
-  tags:
-    - docker
-  script:
-    - source venv/bin/activate
-    - python3 -m pip install numpy coverage
-    - cd ${CI_PROJECT_NAME}
-    # Retrieve the installation path of the module, since it is installed with pip.
-    - export MODULE_LOCATION=`python -c "import ${CI_PROJECT_NAME} as _; print(_.__path__[0])"`
-    - python3 -m coverage run --source=$MODULE_LOCATION -m unittest discover -s unit_tests/ -v -b
-    - python3 -m coverage report
-    - python3 -m coverage xml
-  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
-  artifacts:
-    reports:
-      coverage_report:
-        coverage_format: cobertura
-        path: ${CI_PROJECT_NAME}/coverage.xml
diff --git a/.gitlab/ci/test.gitlab-ci.yml b/.gitlab/ci/test.gitlab-ci.yml
deleted file mode 100644
index 92b932f86193d1525b9bba8cad0b92271f3c966f..0000000000000000000000000000000000000000
--- a/.gitlab/ci/test.gitlab-ci.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-test:ubuntu_cpp:
-  stage: test
-  needs: ["build:ubuntu_cpp"]
-  tags:
-    - docker
-  script:
-    - cd build_cpp
-    - ctest --output-junit ctest-results.xml --output-on-failure
-  artifacts:
-    reports:
-      junit: build_cpp/ctest-results.xml
-
-test:ubuntu_python:
-  stage: test
-  needs: ["build:ubuntu_python"]
-  tags:
-    - docker
-  script:
-    - source venv/bin/activate
-    - cd ${CI_PROJECT_NAME}
-    - python3 -m pip install numpy unittest-xml-reporting
-    - python3 -m pip list
-    # Run on discovery all tests located in core/unit_tests/python
-    - python3 -m xmlrunner discover -s unit_tests/ -v -b --output-file xmlrunner-results.xml
-  artifacts:
-    reports:
-      junit: ${CI_PROJECT_NAME}/xmlrunner-results.xml
-
-# test:windows_cpp:
-#   stage: test
-#   needs: ["build:windows_cpp"]
-#   tags:
-#     - windows
-#   image: buildtools
-#   before_script:
-#     # Install Chocolatey
-#     - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-#     # Install dependencies
-#     - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-#     - choco install python -Y
-#     # Update PATH
-#     - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-#   script:
-#     - cd build_cpp
-#     - ctest --output-junit ctest-results.xml --output-on-failure
-#   artifacts:
-#     reports:
-#       junit: build_cpp/ctest-results.xml
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01ebb6f258b173aee6df867c5c5c991ec936df57..bea1b398ced94f3ec4da97b5db6237fd9882d87d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,15 @@
 # CMake >= 3.18 is required for good support of FindCUDAToolkit
 cmake_minimum_required(VERSION 3.18)
+set(CXX_STANDARD 14)
+
+file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
 
-file(READ "${CMAKE_SOURCE_DIR}/version.txt" version)
-add_definitions(-DPROJECT_VERSION="${version}")
-file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project)
+project(aidge_backend_cuda
+        VERSION ${version}
+        DESCRIPTION "CUDA implementations of the operators of aidge framework."
+        LANGUAGES CXX)
 
-message(STATUS "Project name: ${project}")
+message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
 message(STATUS "Project version: ${version}")
 
 execute_process(
@@ -13,23 +17,18 @@ execute_process(
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_COMMIT_HASH
     OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_QUIET
 )
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
-
 # Define a preprocessor macro with the Git commit version
 add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
-
-# Note : project name is {project} and python module name is also {project}
-set(module_name _${project}) # target name
-
-
-project(${project})
-set(CXX_STANDARD 14)
+# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
+set(module_name _${CMAKE_PROJECT_NAME}) # target name
 
 ##############################################
 # Define options
-option(PYBIND "python binding" ON)
+option(PYBIND "python binding" OFF)
 option(WERROR "Warning as error" OFF)
 option(TEST "Enable tests" ON)
 option(COVERAGE "Enable coverage" OFF)
@@ -38,34 +37,76 @@ option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memor
 ##############################################
 # Import utils CMakeLists
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
-include(PybindModuleCreation)
 
 if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
     Include(CodeCoverage)
 endif()
 
-enable_language(CUDA)
+##############################################
+# Find system dependencies
+##############################################
+# FIND AIDGE Dependencies
+if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
+    set(CMAKE_INSTALL_PREFIX $ENV{AIDGE_INSTALL})
+    list(APPEND CMAKE_PREFIX_PATH $ENV{AIDGE_INSTALL})
+    message(WARNING "Env var AIDGE_INSTALL detected : $ENV{AIDGE_INSTALL}. Set CMAKE_INSTALL_PREFIX to AIDGE_INSTALL & added to CMAKE_PREFIX_PATH"
+                    "\n\tCMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}"
+                    "\n\tCMAKE_PREFIX_PATH = ${CMAKE_PREFIX_PATH}")
+endif()
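+# (The release CI jobs set AIDGE_INSTALL to the cibuildwheel install prefix
+# /AIDGE_INSTALL_CIBUILDWHEEL; see .gitlab-ci.yml.)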
+find_package(aidge_core REQUIRED)
+if(TEST)
+    find_package(aidge_backend_cpu REQUIRED)
+endif()
+
+##########
+# CUDA
+if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
+    message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job."
+                    "\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH Variables")
+    list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64)
+    list(APPEND ENV{PATH} /usr/local/cuda/bin)
+    set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
+endif()
+find_package(CUDAToolkit REQUIRED)
+if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 14)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES native)
+endif()
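+# Note: CMAKE_CUDA_ARCHITECTURES=native targets the GPU(s) detected on the build machine and
+# requires a recent CMake (>= 3.24); override it to build for other architectures.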
 
 message(STATUS "Cuda compiler version = ${CMAKE_CUDA_COMPILER_VERSION}")
 # Define a preprocessor macro with the Cuda compiler version
 add_definitions(-DCUDA_COMPILER_VERSION="${CMAKE_CUDA_COMPILER_VERSION}")
 
+message(STATUS "CUDA STANDARD : ${CMAKE_CUDA_STANDARD}")
+message(STATUS "CUDA ARCHITECTURE : ${CMAKE_CUDA_ARCHITECTURES}")
 
-##############################################
-# Find system dependencies
-find_package(CUDAToolkit REQUIRED)
+enable_language(CUDA)
 
-find_package(aidge_core REQUIRED)
-if(TEST)
-    find_package(aidge_backend_cpu REQUIRED)
-endif()
 ##############################################
 # Create target and set properties
-
 file(GLOB_RECURSE src_files "src/*.cpp" "src/*.cu")
 file(GLOB_RECURSE inc_files "include/*.hpp")
 
 add_library(${module_name} ${src_files} ${inc_files})
+
+# PYTHON BINDING
+if (PYBIND)
+    # Handles Python + pybind11 headers dependencies
+    include(PybindModuleCreation)
+    # Creates a pybind target named after CMAKE_PROJECT_NAME: the Python bindings module has the same name as the project.
+    generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name})
+
+    target_link_libraries(${module_name}
+        PUBLIC
+            pybind11::pybind11
+        PRIVATE
+            Python::Module
+        )
+endif()
+
 target_link_libraries(${module_name}
     PUBLIC
         _aidge_core # _ is added because we link the target not the project
@@ -75,7 +116,7 @@ target_link_libraries(${module_name}
 )
 
 if( ${ENABLE_ASAN} )
-    message("Building ${module_name} with ASAN.")
+    message("Building ${module_name} with ASAN.")
     set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer)
     target_link_libraries(${module_name}
         PUBLIC
@@ -103,27 +144,9 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
-if(NOT DEFINED CMAKE_CUDA_STANDARD)
-    set(CMAKE_CUDA_STANDARD 14)
-    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-endif()
-
 set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-# PYTHON BINDING
-if (PYBIND)
-    generate_python_binding(${project} ${module_name})
-
-    # Handles Python + pybind11 headers dependencies
-    target_link_libraries(${module_name}
-        PUBLIC
-            pybind11::pybind11
-        PRIVATE
-            Python::Python
-        )
-endif()
-
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
@@ -142,11 +165,10 @@ endif()
 
 ##############################################
 # Installation instructions
-
 include(GNUInstallDirs)
-set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${project})
+set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
 
-install(TARGETS ${module_name} EXPORT ${project}-targets
+install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
@@ -157,8 +179,8 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 #Export the targets to a script
 
-install(EXPORT ${project}-targets
- FILE "${project}-targets.cmake"
+install(EXPORT ${CMAKE_PROJECT_NAME}-targets
+ FILE "${CMAKE_PROJECT_NAME}-targets.cmake"
  DESTINATION ${INSTALL_CONFIGDIR}
 #  COMPONENT ${module_name}
 )
@@ -167,32 +189,34 @@ install(EXPORT ${project}-targets
 include(CMakePackageConfigHelpers)
 
 write_basic_package_version_file(
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
     VERSION ${version}
     COMPATIBILITY AnyNewerVersion
 )
 
-configure_package_config_file("${project}-config.cmake.in"
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake"
+configure_package_config_file("${CMAKE_PROJECT_NAME}-config.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
     INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
 )
 
 #Install the config, configversion and custom find modules
 install(FILES
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake"
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
     DESTINATION ${INSTALL_CONFIGDIR}
 )
 
 ##############################################
 ## Exporting from the build tree
-export(EXPORT ${project}-targets
-    FILE "${CMAKE_CURRENT_BINARY_DIR}/${project}-targets.cmake")
-
+export(EXPORT ${CMAKE_PROJECT_NAME}-targets
+    FILE "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-targets.cmake")
 
 ##############################################
 ## Add test
 if(TEST)
+    if(PYBIND)
+        message(FATAL_ERROR "PYBIND and TEST are both enabled. But cannot compile with catch_2.\nChoose between pybind and Catch2 for compilation.")
+    endif()
     enable_testing()
     add_subdirectory(unit_tests)
 endif()
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..7cbc972fca5c7af1ffb6df6c08480ebad982884c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include README.md LICENCE
+recursive-include aidge_backend_cuda *.py 
+recursive-exclude aidge_backend_cuda/unit_tests *.py
+
+recursive-include include *.hpp
+recursive-include src *.cpp
+recursive-include python_binding *.cpp
+include CMakeLists.txt
diff --git a/README.md b/README.md
index 30b34248e75429d321b3b1bfa1861496ed50878f..09e16ed4f48371ff11093aa69a17c576c2b8d173 100644
--- a/README.md
+++ b/README.md
@@ -3,15 +3,28 @@
 # Aidge CUDA library
 
 You can find in this folder the library that implements the CUDA operators.
+[TOC]
 
-## Pip installation
+## Installation
 
-You will need to install first the aidge_core library before installing aidge_backend_cuda.
-Also, make sure that the install path was set before installing aidge_core library.
-Then run in your python environnement : 
+### Dependencies
+- `GCC`
+- `Make`/`Ninja`
+- `CMake`
+- `Python` (optional, only needed if you intend to use this library from Python through pybind)
+
+#### Aidge dependencies
+ - `aidge_core`
+ - `aidge_backend_cpu`
+
+### Pip installation
 ``` bash
 pip install . -v
 ```
+> **TIPS:** Use environment variables to change compilation options, as in the example below:
+> - `AIDGE_INSTALL`: to set the installation folder. Defaults to `/usr/local/lib`. :warning: This path must be identical to the `aidge_core` install path.
+> - `AIDGE_PYTHON_BUILD_TYPE`: to set the compilation mode to **Debug** or **Release**
+> - `AIDGE_BUILD_GEN`: to set the build backend used by CMake (e.g. `Make` or `Ninja`)
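+>
+> A minimal example combining these options (illustrative paths, assuming `aidge_core` is already installed there):
+> ``` bash
+> export AIDGE_INSTALL=/usr/local/lib      # must match the aidge_core install path
+> export AIDGE_PYTHON_BUILD_TYPE=Debug
+> pip install . -v
+> ```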
 
 ## Standard C++ Compilation
 
diff --git a/aidge_backend_cuda/__init__.py b/aidge_backend_cuda/__init__.py
index a2c06b0ec8a29b7100b0cf7c461092c491197331..c59afd66efc6197d93268e9e205f35048a26d60e 100644
--- a/aidge_backend_cuda/__init__.py
+++ b/aidge_backend_cuda/__init__.py
@@ -1 +1,2 @@
-from aidge_backend_cuda.aidge_backend_cuda import * # import so generated by PyBind
+from aidge_backend_cuda.aidge_backend_cuda import *  # import so generated by PyBind
+from ._version import *
diff --git a/aidge_backend_cuda/unit_tests/test_tensor.py b/aidge_backend_cuda/unit_tests/test_tensor.py
index 6c4717442803badd3d0ac2ea96fb3be44baeaaff..035e0f96d9dee1e948c2aa621181c3f215a457c3 100644
--- a/aidge_backend_cuda/unit_tests/test_tensor.py
+++ b/aidge_backend_cuda/unit_tests/test_tensor.py
@@ -6,15 +6,17 @@ import numpy as np
 
 
 class test_tensor(unittest.TestCase):
-    """Test tensor binding
-    """
+    """Test tensor binding"""
+
     def setUp(self):
         pass
+
     def tearDown(self):
         pass
 
     def test_getavailable_backends(self):
         self.assertTrue("cuda" in aidge_core.Tensor.get_available_backends())
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake
index 8030c1a8639e4b7ae0c5fb865e928a4260c6ae7d..8f386bef59ed86dfa366eca5d4fccae24b28d24e 100644
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
@@ -1,21 +1,25 @@
-function(generate_python_binding name target_to_bind)
+function(generate_python_binding pybind_module_name target_to_bind) 
     add_definitions(-DPYBIND)
     Include(FetchContent)
 
+    set(PYBIND_VERSION v2.10.4)
+    set(PYBIND11_FINDPYTHON ON)
+    message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
+
     FetchContent_Declare(
-    PyBind11
-    GIT_REPOSITORY https://github.com/pybind/pybind11.git
-    GIT_TAG        v2.10.4 # or a later release
+        PyBind11
+        GIT_REPOSITORY https://github.com/pybind/pybind11.git
+        GIT_TAG        ${PYBIND_VERSION} # or a later release
     )
 
     # Use the New FindPython mode, recommanded. Requires CMake 3.15+
-    find_package(Python COMPONENTS Interpreter Development)
+    find_package(Python COMPONENTS Interpreter Development.Module)
     FetchContent_MakeAvailable(PyBind11)
 
-    message(STATUS "Creating binding for module ${name}")
+    message(STATUS "Creating binding for module ${pybind_module_name}")
     file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
 
-    pybind11_add_module(${name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
-    target_include_directories(${name} PUBLIC "python_binding")
-    target_link_libraries(${name} PUBLIC ${target_to_bind})
+    pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
+    target_include_directories(${pybind_module_name} PUBLIC "python_binding")
+    target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind})
 endfunction()
diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp
index 580dce246b4c43e9a82fc977103145f79ae0976e..d5e9d1654f0a4fe894ed0e965a25b32c9e5caa06 100644
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -14,17 +14,32 @@
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
 #include "aidge/backend/cuda/operator/AddImpl.hpp"
+#include "aidge/backend/cuda/operator/AndImpl.hpp"
+#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
 #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
+#include "aidge/backend/cuda/operator/DivImpl.hpp"
 #include "aidge/backend/cuda/operator/FCImpl.hpp"
 #include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/LnImpl.hpp"
 #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/MulImpl.hpp"
 #include "aidge/backend/cuda/operator/PadImpl.hpp"
+#include "aidge/backend/cuda/operator/PowImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
 #include "aidge/backend/cuda/operator/ReLUImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
 #include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
 #include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
 #include "aidge/backend/cuda/operator/SubImpl.hpp"
 #include "aidge/backend/cuda/operator/TanhImpl.hpp"
 
+#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
+#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
+
+
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp
index 96045781647f93f0627ca0853a0cdaa66a08af83..541afeecc751332d41ff082b790282abcad5a1b0 100644
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -221,7 +221,39 @@ public:
                                             &strides[0]));
             }
         }
+        else {
+            // Compare if the shape of the tensor has changed
+            cudnnDataType_t currentDataType;
+            int currentNbDims;
+            // Since we don't know the nb dims of the current tensor, we init with CUDNN_DIM_MAX then remove the trailing zeros
+            std::vector<int> currentDims(CUDNN_DIM_MAX);
+            std::vector<int> currentStrides(CUDNN_DIM_MAX);
+
+            CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, &currentDataType, &currentNbDims, currentDims.data(), currentStrides.data()));
+            // Remove the trailing zeros
+            currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(),
+                              currentDims.end());
+
+            std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
+            if (dims.size() < 4) {
+                dims.resize(4, 1);
+            }
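+            // Pad to at least 4 dimensions before comparing/updating, matching the convention
+            // used when the descriptor is first created above.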
+
+            // Update descriptor if shape has changed
+            if (dims != currentDims) {
+                std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());
 
+                if (strides.size() < 4) {
+                    strides.resize(4, 1);
+                }
+
+                CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
+                                            CudaContext::data_type<T>::value,
+                                            dims.size(),
+                                            &dims[0],
+                                            &strides[0]));
+            }
+        }
         return mCudnnTensor;
     }
 
@@ -255,7 +287,7 @@ static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
 static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
         {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
 static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
-        {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int>::create);
+        {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
 }  // namespace
 }  // namespace Aidge
 
diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp
index cd1819753cd00a325443d9c9c992f3d2347bb377..429d6f1b04489d9e38ce96d584a1ce9528dd0b2d 100644
--- a/include/aidge/backend/cuda/operator/AddImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AddImpl.hpp
@@ -27,30 +27,33 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class AddImpl_cuda : public OperatorImpl {
-private:
-
-
 public:
-    AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {}
+    AddImpl_cuda(const Add_Op& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) {
+    static std::unique_ptr<AddImpl_cuda> create(const Add_Op& op) {
         return std::make_unique<AddImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
-    // ~AddImpl_cuda();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
 private:
     template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
     template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
 };
 
-namespace {
-// add cuda backend to Add_Op implementation registry
-static Registrar<Add_Op> registrarAddImpl_cuda("cuda", Aidge::AddImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Add_Op, "cuda", Aidge::AddImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/AndImpl.hpp b/include/aidge/backend/cuda/operator/AndImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4105ec87db2c58e218c629a1c94f31efd37c80ee
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/AndImpl.hpp
@@ -0,0 +1,57 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/And.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class AndImpl_cuda : public OperatorImpl {
+public:
+    AndImpl_cuda(const And_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<AndImpl_cuda> create(const And_Op& op) {
+        return std::make_unique<AndImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(And_Op, "cuda", Aidge::AndImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bae79a03d03cd5fb7d5fdc4fbebf1dd7562370ae
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_
+
+#include <cfloat>
+#include <stdexcept>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+template <class T>
+void AndForward(const T* input1, const T* input2, T* output,
+                const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
+                const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
+                int outSize);
+}
+#endif /* AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a89aebf96914f258f6be616b940ec195ec9ae2a9
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp
@@ -0,0 +1,60 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class ArgMaxImpl_cuda : public OperatorImpl {
+public:
+    ArgMaxImpl_cuda(const ArgMax_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op& op) {
+        return std::make_unique<ArgMaxImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+
+private:
+    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ArgMax_Op, "cuda", Aidge::ArgMaxImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c07bf597f6422a26cedd4176fdb1ef29bcabcef
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp
@@ -0,0 +1,31 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_
+#define AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_
+
+#include <cfloat>
+#include <cstddef>
+#include <stdexcept>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge
+{
+    template <class T>
+    void ArgMax_cuda_forward_kernel(const T* input, T* output,
+                                    const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
+                                    int axis, int total_elems, std::size_t selectLastIdx);
+}
+#endif /* AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
index 540ec574f9b5fbcea8b8f28e390cbe05f1e0fa8e..7f8fb4075affd3e5f17533ea67b051dbb6395f04 100644
--- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
@@ -27,35 +27,41 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 template <DimIdx_t DIM>
 class AvgPoolingImpl_cuda : public OperatorImpl {
-private:
-    // CuDNN specific variables
-    cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
-    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
-
 public:
-    AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    AvgPoolingImpl_cuda(const AvgPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<2> &op) {
+    static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<DIM>& op) {
         return std::make_unique<AvgPoolingImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
     ~AvgPoolingImpl_cuda();
 
 private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to AvgPooling_Op<2> implementation registry
-static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl_cuda("cuda", Aidge::AvgPoolingImpl_cuda<2>::create);
-}  // namespace
+// Implementation entry point registration to Operator
+using AvgPooling2D_Op = AvgPooling_Op<2>;
+REGISTRAR(AvgPooling2D_Op, "cuda", Aidge::AvgPoolingImpl_cuda<2>::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
index 3451d07f289371202570434f96546344c0c4fb26..5ba8656ef8a25ffa53584641a938f637ecff9b94 100644
--- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
+++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
@@ -27,35 +27,41 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 template <DimIdx_t DIM>
 class BatchNormImpl_cuda : public OperatorImpl {
-private:
-    // CuDNN specific variables
-    cudnnTensorDescriptor_t mBNDesc = nullptr;
-    cudnnBatchNormMode_t mMode;
-    double mEpsilon;
-
 public:
-    BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    BatchNormImpl_cuda(const BatchNorm_Op<DIM>& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM> &op) {
+    static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM>& op) {
         return std::make_unique<BatchNormImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
     ~BatchNormImpl_cuda();
 
 private:
+    // CuDNN specific variables
+    cudnnTensorDescriptor_t mBNDesc = nullptr;
+    cudnnBatchNormMode_t mMode;
+    double mEpsilon;
+
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4);
     template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
 };
 
-namespace {
-// add cuda backend to BatchNorm_Op<2> implementation registry
-static Registrar<BatchNorm_Op<2>> registrarBatchNormImpl_cuda("cuda", Aidge::BatchNormImpl_cuda<2>::create);
-}  // namespace
+// Implementation entry point registration to Operator
+using BatchNorm2D_Op = BatchNorm_Op<2>;
+REGISTRAR(BatchNorm2D_Op, "cuda", Aidge::BatchNormImpl_cuda<2>::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index 0722048f7cf021104a9694a621b1c0dad00ce423..ce94ec6695735c93d5c8d0acfdc6153e91e7147d 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -29,13 +29,35 @@
 
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 template <DimIdx_t DIM>
 class ConvImpl_cuda : public OperatorImpl {
+public:
+    ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {}
+
+    static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM>& op) {
+        return std::make_unique<ConvImpl_cuda<DIM>>(op);
+    }
+
+    static std::unique_ptr<ConvImpl_cuda<DIM>> createDW(const ConvDepthWise_Op<DIM> &op) {
+        return std::make_unique<ConvImpl_cuda<DIM>>(op, true);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+    ~ConvImpl_cuda();
+
 private:
     // CuDNN specific variables
     cudnnConvolutionDescriptor_t mConvDesc = nullptr;
     cudnnFilterDescriptor_t mFilterDesc = nullptr;
-    cudnnConvolutionFwdAlgo_t mFwdAlgo;
+    cudnnConvolutionFwdAlgo_t mFwdAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
     cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
     cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
     size_t mWorkspaceSize = 0;
@@ -46,31 +68,15 @@ private:
     std::shared_ptr<Tensor> mInput2Fallback;
     bool mDepthWise = false;
 
-public:
-    ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {}
-
-    static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) {
-        return std::make_unique<ConvImpl_cuda>(op);
-    }
-
-    static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> &op) {
-        return std::make_unique<ConvImpl_cuda>(op, true);
-    }
-
-public:
-    void forward();
-    void backward();
-    ~ConvImpl_cuda();
-
-private:
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
     template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
 };
 
-namespace {
-static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create);
-static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW);
-}  // namespace
+// Implementation entry point registration to Operator
+using Conv2D_Op = Conv_Op<2>;
+using ConvDepthWise2D_Op = ConvDepthWise_Op<2>;
+REGISTRAR(Conv2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::create);
+REGISTRAR(ConvDepthWise2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::createDW);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_CONVIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/DivImpl.hpp b/include/aidge/backend/cuda/operator/DivImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b15445cb791aa1cf2520018d1015e19aaf10ce3
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/DivImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Div.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class DivImpl_cuda : public OperatorImpl {
+public:
+    DivImpl_cuda(const Div_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<DivImpl_cuda> create(const Div_Op& op) {
+        return std::make_unique<DivImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    template <class T> void backward_(const Tensor& outGrad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Div_Op, "cuda", Aidge::DivImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..512bec77bb63570ffeb8f1681e4e25cd323535fa
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp
@@ -0,0 +1,39 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <class T>
+void divForward(const T* input1, T* output, const T* input2,
+                const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
+                const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                int outSize);
+
+}
+#endif /* AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ */
+
+
+
+
+
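The declaration above carries per-input dims and strides so the element-wise division can broadcast its two operands. As a hedged sketch only (the actual kernel lives in the corresponding .cu file and may differ), a broadcast-aware kernel behind such a wrapper could look like the following; the kernel name, the device-side dims/strides arrays, and the assumption that input shapes are padded to the output rank are all illustrative:

    // Illustrative broadcast-aware division kernel (not the shipped implementation).
    // Assumes input1Dims/input2Dims are padded to the same rank as outputDims and
    // that all dim/stride arrays have been copied to device memory.
    template <class T>
    __global__ void divForwardKernelSketch(const T* input1, const T* input2, T* output,
                                           const int* input1Dims, const int* input2Dims,
                                           const int* outputDims,
                                           const int* input1Strides, const int* input2Strides,
                                           int nbDims, int outSize)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= outSize) return;

        // Decompose the flat output index into per-dimension coordinates, then
        // re-linearise with each input's strides; taking the coordinate modulo the
        // input dimension repeats size-1 axes, which is what broadcasting requires.
        int remaining = idx, offset1 = 0, offset2 = 0;
        for (int d = nbDims - 1; d >= 0; --d) {
            const int coord = remaining % outputDims[d];
            remaining /= outputDims[d];
            offset1 += (coord % input1Dims[d]) * input1Strides[d];
            offset2 += (coord % input2Dims[d]) * input2Strides[d];
        }
        output[idx] = input1[offset1] / input2[offset2];
    }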
diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp
index 46f7849d1f17aab5496bdbde013ef078ad1f5a7c..f2dd0c90c0096a1b57fb6860e5991d0c1e824be9 100644
--- a/include/aidge/backend/cuda/operator/FCImpl.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl.hpp
@@ -27,34 +27,37 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class FCImpl_cuda : public OperatorImpl {
-private:
-    std::shared_ptr<Tensor> mInput0Fallback;
-    std::shared_ptr<Tensor> mInput1Fallback;
-    std::shared_ptr<Tensor> mInput2Fallback;
-
-
 public:
-    FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {}
+    FCImpl_cuda(const FC_Op& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) {
+    static std::unique_ptr<FCImpl_cuda> create(const FC_Op& op) {
         return std::make_unique<FCImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
-    // ~FCImpl_cuda();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
 
 private:
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;
+
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels);
     template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels);
 };
 
-namespace {
-// add cuda backend to FC_Op implementation registry
-static Registrar<FC_Op> registrarFCImpl_cuda("cuda", Aidge::FCImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(FC_Op, "cuda", Aidge::FCImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
index 8d1af8f7c5954c2eae9179926aec433eee34414f..a956960df0a4dccb4ef9eb0634e5f61b9ddede0a 100644
--- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_
 
 #include <stdexcept>
 #include <cfloat>
@@ -42,4 +42,4 @@ cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans,
                           const T *beta,
                           T *y, int incy);
 }
-#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp
index 6e0fad5c01efb6474f527dee0bfbfdc594788bc6..3f0386dcfa68d4b55bebeb524dfedfd5edeb0fe9 100644
--- a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp
@@ -27,34 +27,37 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class GlobalAveragePoolingImpl_cuda : public OperatorImpl {
-private:
-    // CuDNN specific variables
-    cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr;
-    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
-
 public:
-    GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) : OperatorImpl(op, "cuda") {}
+    GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op &op) {
+    static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op& op) {
         return std::make_unique<GlobalAveragePoolingImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
     ~GlobalAveragePoolingImpl_cuda();
 
 private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to GlobalAveragePooling_Op implementation registry
-static Registrar<GlobalAveragePooling_Op> registrarGlobalAveragePoolingImpl_cuda("cuda", Aidge::GlobalAveragePoolingImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(GlobalAveragePooling_Op, "cuda", Aidge::GlobalAveragePoolingImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..742401de7903f19ab4d8f51a153b0e864f21dd47
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 10.09.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ILayerNorm.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class ILayerNormImpl_cuda : public OperatorImpl {
+public:
+    ILayerNormImpl_cuda(const ILayerNorm_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ILayerNormImpl_cuda> create(const ILayerNorm_Op &op) {
+        return std::make_unique<ILayerNormImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+    template <class T> void backward_(const Tensor& output_grad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ILayerNorm_Op, "cuda", Aidge::ILayerNormImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa54029ea29bc46809f227038a1a23d91bc161ee
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp
@@ -0,0 +1,92 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 10.09.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+/**
+    * @brief Compute the forward for ILayerNorm
+    * @param input: Input tensor
+    * @param SF: Scaling factor of input tensor
+    * @param dims: Dimensions of input tensor
+    * @param quantized_tensor: Quantized output tensor
+    * @param square_tensor: Tensor used for intermediate computation
+    * @param weight: weight of ILayerNorm layer 
+    * @param bias: bias of ILayerNorm layer
+    * @param new_SF: Scaling factor of the output, which can be used to dequantize it
+*/
+template <class T>
+__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor, long long int* square_tensor, T* weight, T* bias, double new_SF);
+
+/**
+    * @brief Wrapper function to execute ILayerNormforward_
+    * @note The output corresponds to the non-quantized tensor; to obtain the quantized tensor, copy quantized_tensor rather than input_cuda_tensor
+    * @param input: Input tensor
+    * @param output: Output tensor (not quantized)
+    * @param SF: Scaling factor of input tensor
+    * @param weight_raw: weight of ILayerNorm layer 
+    * @param bias_raw: bias of ILayerNorm layer
+    * @param size: Number of elements in the input tensor
+    * @param dims_input: Dimensions of input tensor
+*/
+template <class T>
+void ILayerNormforward(const T* input, T* output, double SF, const T* weight_raw, const T* bias_raw, size_t size, std::vector<long unsigned int> dims_input);
+
+/**
+    * @brief Compute the backward for ILayerNorm
+    * @param output_grad: Gradient of output tensor
+    * @param input_tensor: Input tensor
+    * @param output_tensor: Output tensor obtained after forward
+    * @param mean: Arithmetic mean of input tensor
+    * @param var: Arithmetic variance of input tensor
+    * @param weight: weight of ILayerNorm layer 
+    * @param bias: bias of ILayerNorm layer
+    * @param input_grad: Gradient of input tensor 
+    * @param weight_grad: Gradient of ILayerNorm weight 
+    * @param bias_grad: Gradient of ILayerNorm bias 
+    * @param size: Number of elements in the input tensor
+*/
+template <class T>
+__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size);
+
+/**
+    * @brief Wrapper function to execute ILayerNormbackward_
+    * @param input_tensor: Input tensor
+    * @param output_grad: Gradient of output tensor
+    * @param output_tensor: Output tensor obtained after forward
+    * @param mean: Arithmetic mean of input tensor
+    * @param var: Arithmetic variance of input tensor
+    * @param weight: weight of ILayerNorm layer 
+    * @param bias: bias of ILayerNorm layer
+    * @param input_grad: Gradient of input tensor 
+    * @param weight_grad: Gradient of ILayerNorm weight 
+    * @param bias_grad: Gradient of ILayerNorm bias 
+    * @param size: Number of elements in the input tensor
+*/
+template <class T>
+void ILayerNormbackward(const T* input_tensor, const T* output_grad, const T* output_tensor,const T* mean,const T* var, const T* weight, const T* bias, T* input_grad, T* weight_grad, T* bias_grad, size_t size);
+
+}
+
+#endif /* AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/LnImpl.hpp b/include/aidge/backend/cuda/operator/LnImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1617754fbf5dd52e099a9787a25a827851933af9
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/LnImpl.hpp
@@ -0,0 +1,62 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Ln.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class LnImpl_cuda : public OperatorImpl {
+public:
+    LnImpl_cuda(const Ln_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<LnImpl_cuda> create(const Ln_Op& op) {
+        return std::make_unique<LnImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Ln_Op, "cuda", Aidge::LnImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9652d88116ca2cac92abbc517f8bc650655f43cc
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <class T>
+void lnForward(const T* input, T* output, int size);
+
+}
+#endif /* AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ */
+
+
+
+
+
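lnForward is a flat element-wise natural logarithm, so its contract is easy to illustrate. Below is a minimal sketch of a matching kernel and launch wrapper, assuming device pointers and a float specialisation (the shipped .cu file also covers the other declared types); kernel and wrapper names are illustrative:

    // Illustrative element-wise ln kernel and launcher (not the shipped code).
    __global__ void lnForwardKernelSketch(const float* input, float* output, int size)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) {
            output[idx] = logf(input[idx]);  // natural logarithm, one element per thread
        }
    }

    void lnForwardSketch(const float* input, float* output, int size)
    {
        constexpr int threads = 256;
        const int blocks = (size + threads - 1) / threads;  // enough blocks to cover every element
        lnForwardKernelSketch<<<blocks, threads>>>(input, output, size);
    }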
diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
index db7f1e376013db52aeb1b27f8cc3ff192c7f0629..a203e761beaeccec96b36bbd5a424a193cdb6387 100644
--- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
@@ -27,35 +27,39 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 template <DimIdx_t DIM>
 class MaxPoolingImpl_cuda : public OperatorImpl {
-private:
-    // CuDNN specific variables
-    cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
-    cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
-    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
-
 public:
-    MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    MaxPoolingImpl_cuda(const MaxPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<2> &op) {
+    static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<DIM>& op) {
         return std::make_unique<MaxPoolingImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
     ~MaxPoolingImpl_cuda();
 
 private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to MaxPooling_Op<2> implementation registry
-static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl_cuda("cuda", Aidge::MaxPoolingImpl_cuda<2>::create);
-}  // namespace
+// Implementation entry point registration to Operator
+using MaxPooling2D_Op = MaxPooling_Op<2>;
+REGISTRAR(MaxPooling2D_Op, "cuda", Aidge::MaxPoolingImpl_cuda<2>::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..37d3d5a0df7b63dc63ad13737d8a8b463bf315c8
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/MulImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Mul.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class MulImpl_cuda : public OperatorImpl {
+public:
+    MulImpl_cuda(const Mul_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<MulImpl_cuda> create(const Mul_Op& op) {
+        return std::make_unique<MulImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Mul_Op, "cuda", Aidge::MulImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp b/include/aidge/backend/cuda/operator/PadImpl.hpp
index 4452d3408e7b4780c1e5c4ea6553ba0b713df231..d51361d6ee5a3ec9a858d290b3f5fe5251b6fa97 100644
--- a/include/aidge/backend/cuda/operator/PadImpl.hpp
+++ b/include/aidge/backend/cuda/operator/PadImpl.hpp
@@ -27,35 +27,41 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 template <DimIdx_t DIM>
 class PadImpl_cuda : public OperatorImpl {
-private:
-    // CuDNN specific variables
-    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
-    int mLeftPad, mTopPad;
-    double mPadVal;
-    unsigned int mPadType;
-
 public:
-    PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    PadImpl_cuda(const Pad_Op<DIM>& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<2> &op) {
+    static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM>& op) {
         return std::make_unique<PadImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
 
 private:
+    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+    int mLeftPad, mTopPad;
+    double mPadVal;
+    unsigned int mPadType;
+
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& outGrad);
 };
 
-namespace {
-// add cuda backend to Pad_Op<2> implementation registry
-static Registrar<Pad_Op<2>> registrarPadImpl_cuda("cuda", Aidge::PadImpl_cuda<2>::create);
-}  // namespace
+// Implementation entry point registration to Operator
+using Pad2D_Op = Pad_Op<2>;
+REGISTRAR(Pad2D_Op, "cuda", Aidge::PadImpl_cuda<2>::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp
index c6a83160da5cf3fea3d3415959c965e16c1eb4ff..11ddb0ea8b0e6603bf009c4ae0a7fa3247a8904f 100644
--- a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_
 
 #include "aidge/data/Data.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
@@ -34,4 +34,4 @@ namespace Aidge
                      const T *input,
                      T *outputs);
 }
-#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/PowImpl.hpp b/include/aidge/backend/cuda/operator/PowImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..403648d9a294ee598f117c8b05e6f0875e998307
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/PowImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Pow.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class PowImpl_cuda : public OperatorImpl {
+public:
+    PowImpl_cuda(const Pow_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<PowImpl_cuda> create(const Pow_Op& op) {
+        return std::make_unique<PowImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    template <class T> void backward_(const Tensor& outGrad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Pow_Op, "cuda", Aidge::PowImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e89bea53ba766b0bd90f0c7acd631b0370d96298
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+template <class T>
+void powForward(const T* input, T* output, const T* exponent,
+                const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
+                const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
+                int outSize);
+
+}
+#endif /* AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ */
+
+
+
+
+
diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
index 285713f460b9d5b5e868c0c07ab23804f30dd694..344923ba1ee08642a3e3e5f685bfd2c7de8a74b4 100644
--- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
@@ -27,7 +27,25 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class ReLUImpl_cuda : public OperatorImpl {
+public:
+    ReLUImpl_cuda(const ReLU_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op& op) {
+        return std::make_unique<ReLUImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+    ~ReLUImpl_cuda();
+
 private:
     // CuDNN specific variables
     #if CUDNN_VERSION >= 5000
@@ -38,27 +56,12 @@ private:
     std::shared_ptr<Tensor> mInputFallback;
     std::shared_ptr<Tensor> mOutputGradFallback;
 
-public:
-    ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {}
-
-    static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) {
-        return std::make_unique<ReLUImpl_cuda>(op);
-    }
-
-public:
-    void forward();
-    void backward();
-    ~ReLUImpl_cuda();
-
-private:
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to ReLU_Op implementation registry
-static Registrar<ReLU_Op> registrarReLUImpl_cuda("cuda", Aidge::ReLUImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(ReLU_Op, "cuda", Aidge::ReLUImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d352b8b1d14aeaa4230accd7aa81c279c18b7a8
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp
@@ -0,0 +1,30 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_
+#define AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge
+{
+
+    template <class T>
+    void ReduceBackward(const T* input,
+                                   T* output,
+                                   const std::vector<std::size_t>& inputDims,
+                                   const std::vector<std::size_t>& outputDims,
+                                   const std::vector<int>& axes,
+                                   const std::vector<std::size_t>& factors,
+                                   int outSize);
+}
+#endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a50ff21b35f0b062c6a9c327ea2892c15055a175
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp
@@ -0,0 +1,62 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ReduceMean.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class ReduceMeanImpl_cuda : public OperatorImpl {
+public:
+    ReduceMeanImpl_cuda(const ReduceMean_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ReduceMeanImpl_cuda> create(const ReduceMean_Op& op) {
+        return std::make_unique<ReduceMeanImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims);
+    template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ReduceMean_Op, "cuda", Aidge::ReduceMeanImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5a7ae48d7e5bd8f370964d7f81795ecbaa5986b
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp
@@ -0,0 +1,62 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class ReduceSumImpl_cuda : public OperatorImpl {
+public:
+    ReduceSumImpl_cuda(const ReduceSum_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op& op) {
+        return std::make_unique<ReduceSumImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims);
+    template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ReduceSum_Op, "cuda", Aidge::ReduceSumImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
index 7b43df680bef115310669f0d55f2f78ef4fe9fa6..d412590c63f925806973038d67ee18e0847f79c2 100644
--- a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
@@ -27,27 +27,32 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class ReshapeImpl_cuda : public OperatorImpl {
-private:
-    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
-
 public:
-    ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {}
+    ReshapeImpl_cuda(const Reshape_Op& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) {
+    static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op& op) {
         return std::make_unique<ReshapeImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
-    ~ReshapeImpl_cuda();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
 };
 
-namespace {
-// add cuda backend to Reshape_Op implementation registry
-static Registrar<Reshape_Op> registrarReshapeImpl_cuda("cuda", Aidge::ReshapeImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Reshape_Op, "cuda", Aidge::ReshapeImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f83b41ae139482cdb0cd1060846c77ba78fcc0ee
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ShiftGELU.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class ShiftGELUImpl_cuda : public OperatorImpl {
+public:
+    ShiftGELUImpl_cuda(const ShiftGELU_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ShiftGELUImpl_cuda> create(const ShiftGELU_Op &op) {
+        return std::make_unique<ShiftGELUImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+
+    void forward() override;
+    void backward() override;
+
+private:
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+    
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ShiftGELU_Op, "cuda", Aidge::ShiftGELUImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..14268521451a631ccb9194d44ed7543af8d494f5
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp
@@ -0,0 +1,78 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+/**
+    * @brief Compute the forward for ShiftGELU
+    * @param input: Input tensor
+    * @param quantized_tensor: Quantized output tensor
+    * @param GELUtensor: Pointer to an empty memory block allocated on the GPU (used only for intermediate computation)
+    * @param SumTensor: Pointer to an empty memory block allocated on the GPU (used only for intermediate computation)
+    * @param dims: Dimensions of input tensor
+    * @param SF: Scaling factor of input tensor
+    * @param N: Arithmetic precision, currently set to 15 as in I-ViT (the larger N is, the more precise the operation, but the more bits it requires)
+    * @param output_bits: Desired bit precision (8 for int8, for example)
+*/
+template <class T>
+__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits);
+
+/**
+    * @brief Wrapper function to execute ShiftGELUforward_
+    * @note The output corresponds to the non-quantized tensor; to obtain the quantized tensor, copy quantized_tensor rather than input_cuda_tensor
+    * @param input: Input tensor
+    * @param output: Output tensor (not quantized)
+    * @param SF: Scaling factor of input tensor
+    * @param N: Arithmetic precision, currently set to 15 as in I-ViT (the larger N is, the more precise the operation, but the more bits it requires)
+    * @param output_bits: Desired bit precision (8 for int8, for example)
+    * @param size: Number of elements in the input tensor
+    * @param dims_input: Dimensions of input tensor
+*/
+template <class T>
+void ShiftGELUforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input);
+
+/**
+    * @brief Compute the backward for ShiftGELU
+    * @param input_grad: Gradient of input tensor (that we want to obtain)
+    * @param output_tensor: Output tensor obtained after forward
+    * @param output_grad: Gradient of output tensor
+    * @param size: Number of elements in the input tensor
+*/
+template <class T>
+__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size);
+
+/**
+    * @brief Wrapper function to execute ShiftGELUbackward_
+    * @param output_tensor: Output tensor obtained after forward
+    * @param output_grad: Gradient of output tensor
+    * @param input_grad: Gradient of input tensor (that we want to obtain)
+    * @param size: Number of elements in the input tensor
+*/
+template <class T>
+void ShiftGELUbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size);
+
+}
+
+#endif /* AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ */
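For orientation, a hypothetical host-side call of the ShiftGELUforward wrapper declared above. Everything beyond the declared signature is an assumption: that the wrapper expects device pointers, that dims_input is simply the tensor shape, and the flat 1-D shape used here.

    // Hypothetical usage sketch of ShiftGELUforward; pointer and shape conventions are assumed.
    #include <cstddef>
    #include <vector>
    #include <cuda_runtime_api.h>
    #include "aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp"

    void runShiftGeluSketch(const std::vector<float>& hostIn, std::vector<float>& hostOut,
                            double scalingFactor)
    {
        const std::size_t size = hostIn.size();
        float *dIn = nullptr, *dOut = nullptr;
        cudaMalloc(&dIn,  size * sizeof(float));
        cudaMalloc(&dOut, size * sizeof(float));
        cudaMemcpy(dIn, hostIn.data(), size * sizeof(float), cudaMemcpyHostToDevice);

        // N = 15 (I-ViT arithmetic precision) and 8-bit quantisation, as documented above;
        // the 1-D shape is purely for illustration.
        Aidge::ShiftGELUforward<float>(dIn, dOut, scalingFactor, /*N=*/15, /*output_bits=*/8,
                                       size, std::vector<long unsigned int>{size});

        hostOut.resize(size);
        cudaMemcpy(hostOut.data(), dOut, size * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(dIn);
        cudaFree(dOut);
    }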
diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..707b5616fde120f7e8ef38e6dc9f1552cfdb0d59
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp
@@ -0,0 +1,64 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ShiftMax.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class ShiftMaxImpl_cuda : public OperatorImpl {
+public:
+    ShiftMaxImpl_cuda(const ShiftMax_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ShiftMaxImpl_cuda> create(const ShiftMax_Op &op) {
+        return std::make_unique<ShiftMaxImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+    
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(ShiftMax_Op, "cuda", Aidge::ShiftMaxImpl_cuda::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..037a7cbb6362a8eca5a9e6f5a277b29a6a6bd907
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp
@@ -0,0 +1,79 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_
+#define AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+/**
+    * @brief Compute the forward for ShiftMax
+    * @param input: Input tensor
+    * @param quantized_tensor: Quantized output tensor
+    * @param factor: Pointer to an empty memory block allocated on the GPU (used only for intermediate computation)
+    * @param dims: Dimensions of input tensor
+    * @param SF: Scaling factor of input tensor
+    * @param N: Arithmetic precision, currently set to 15 as in I-ViT (the larger N is, the more precise the operation, but the more bits it requires)
+    * @param output_bits: Desired bit precision (8 for int8, for example)
+    * @param new_SF: Scaling factor of the output, which can be used to dequantize it
+*/
+template <class T>
+__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF);
+
+/**
+    * @brief Wrapper function to execute ShiftMaxforward_
+    * @note The output corresponds to the non-quantized tensor; to obtain the quantized tensor, copy quantized_tensor rather than input_cuda_tensor
+    * @param input: Input tensor
+    * @param output: Output tensor (not quantized)
+    * @param SF: Scaling factor of input tensor
+    * @param N: Arithmetic precision, currently set to 15 as in I-ViT (the larger N is, the more precise the operation, but the more bits it requires)
+    * @param output_bits: Desired bit precision (8 for int8, for example)
+    * @param size: Number of elements in the input tensor
+    * @param dims_input: Dimensions of input tensor
+*/
+template <class T>
+void ShiftMaxforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input);
+
+/**
+    * @brief Compute the backward for ShiftMax
+    * @param input_grad: Gradient of input tensor (that we want to obtain)
+    * @param output_tensor: Output tensor obtained after forward
+    * @param output_grad: Gradient of output tensor
+    * @param dims: Dimensions of input tensor
+*/
+template <class T>
+__global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T* output_grad, const int* dims);
+
+/**
+    * @brief Wrapper function to execute ShiftMaxbackward_
+    * @param output_tensor: Output tensor obtained after forward
+    * @param output_grad: Gradient of output tensor
+    * @param input_grad: Gradient of input tensor (that we want to obtain)
+    * @param size: Number of elements in the input tensor
+    * @param dims: Dimensions of input tensor
+*/
+template <class T>
+void ShiftMaxbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size, std::vector<long unsigned int> dims);
+
+}
+
+#endif /* AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp
index 90dbb717732ad788b868fdc95eb55579a5e0b9f6..bc29b9e5f53716641a692cd63c29f4600f3cdd02 100644
--- a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp
+++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp
@@ -27,7 +27,25 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class SigmoidImpl_cuda : public OperatorImpl {
+public:
+    SigmoidImpl_cuda(const Sigmoid_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op& op) {
+        return std::make_unique<SigmoidImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+    ~SigmoidImpl_cuda();
+
 private:
     // CuDNN specific variables
     #if CUDNN_VERSION >= 5000
@@ -38,27 +56,12 @@ private:
     std::shared_ptr<Tensor> mInputFallback;
     std::shared_ptr<Tensor> mOutputGradFallback;
 
-public:
-    SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {}
-
-    static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) {
-        return std::make_unique<SigmoidImpl_cuda>(op);
-    }
-
-public:
-    void forward();
-    void backward();
-    ~SigmoidImpl_cuda();
-
-private:
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to Sigmoid_Op implementation registry
-static Registrar<Sigmoid_Op> registrarSigmoidImpl_cuda("cuda", Aidge::SigmoidImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Sigmoid_Op, "cuda", Aidge::SigmoidImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp
index fd1a76692abdf16b9854b90f535f68329ae5877a..45c833f3e7f9f25258469a4d1e34e8598df068ef 100644
--- a/include/aidge/backend/cuda/operator/SubImpl.hpp
+++ b/include/aidge/backend/cuda/operator/SubImpl.hpp
@@ -27,30 +27,33 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class SubImpl_cuda : public OperatorImpl {
-private:
-
-
 public:
-    SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {}
+    SubImpl_cuda(const Sub_Op& op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) {
+    static std::unique_ptr<SubImpl_cuda> create(const Sub_Op& op) {
         return std::make_unique<SubImpl_cuda>(op);
     }
 
-public:
-    void forward();
-    void backward();
-    // ~SubImpl_cuda();
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
 private:
     template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
     template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
 };
 
-namespace {
-// add cuda backend to Sub_Op implementation registry
-static Registrar<Sub_Op> registrarSubImpl_cuda("cuda", Aidge::SubImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Sub_Op, "cuda", Aidge::SubImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp
index 35e879513fee0ec9354edecefd3d53860e54a0b1..166acd6adee397a3f284363a9db1e71152467b94 100644
--- a/include/aidge/backend/cuda/operator/TanhImpl.hpp
+++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp
@@ -27,7 +27,25 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
 class TanhImpl_cuda : public OperatorImpl {
+public:
+    TanhImpl_cuda(const Tanh_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op& op) {
+        return std::make_unique<TanhImpl_cuda>(op);
+    }
+
+    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Any}
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+    ~TanhImpl_cuda();
+
 private:
     // CuDNN specific variables
     #if CUDNN_VERSION >= 5000
@@ -38,27 +56,12 @@ private:
     std::shared_ptr<Tensor> mInputFallback;
     std::shared_ptr<Tensor> mOutputGradFallback;
 
-public:
-    TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {}
-
-    static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) {
-        return std::make_unique<TanhImpl_cuda>(op);
-    }
-
-public:
-    void forward();
-    void backward();
-    ~TanhImpl_cuda();
-
-private:
     template <class T> void forward_(const Tensor& input);
     template <class T> void backward_(const Tensor& output_grad);
 };
 
-namespace {
-// add cuda backend to Tanh_Op implementation registry
-static Registrar<Tanh_Op> registrarTanhImpl_cuda("cuda", Aidge::TanhImpl_cuda::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Tanh_Op, "cuda", Aidge::TanhImpl_cuda::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */
diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp
index 7218cc24aed718f57a1866be74e7ba9124a5a7f1..f21886e502b9017aa55e250e7257d16bc5d04501 100644
--- a/include/aidge/backend/cuda/utils/CudaContext.hpp
+++ b/include/aidge/backend/cuda/utils/CudaContext.hpp
@@ -157,8 +157,10 @@ namespace Aidge {
             return CUDNN_DATA_UINT8;
         case DataType::Int32:
             return CUDNN_DATA_INT32;
+#if CUDNN_VERSION >= 8100
         case DataType::Int64:
             return CUDNN_DATA_INT64;
+#endif
         default:
             assert(false && "Unsupported CuDNN type");
         }
diff --git a/project_name.txt b/project_name.txt
deleted file mode 100644
index d029f485dbbd8bc3386a50eee3a4d8aa201aeb74..0000000000000000000000000000000000000000
--- a/project_name.txt
+++ /dev/null
@@ -1 +0,0 @@
-aidge_backend_cuda
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..911af058463e100bc35453315919ec8bda3f2845
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,114 @@
+[project]
+name = "aidge_backend_cuda"
+description="CUDA implementations of the operators of the Aidge framework"
+dependencies = [
+    "numpy",
+]
+requires-python = ">= 3.7"
+readme = "README.md"
+license = { file = "LICENSE" }
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Programming Language :: Python :: 3"
+]
+dynamic = ["version"]
+
+#####################################################
+# SETUPTOOLS
+[tool.setuptools]
+[tool.setuptools.packages.find]
+where = ["."]  # list of folders that contain the packages (["."] by default)
+include = ["aidge_backend_cuda*"]  # package names should match these glob patterns (["*"] by default)
+exclude = ["aidge_backend_cuda.unit_tests*"]  # exclude packages matching these glob patterns (empty by default)
+namespaces = false  # to disable scanning PEP 420 namespaces (true by default)
+# SETUPTOOLS_SCM
+[tool.setuptools_scm]
+write_to = "aidge_backend_cuda/_version.py"
+
+[build-system]
+requires = [
+    "setuptools>=68",
+    "setuptools-scm",
+    "cmake>=3.18.0",
+    "toml"
+]
+build-backend = "setuptools.build_meta"
+
+#####################################################
+# CIBUILDWHEEL
+[tool.cibuildwheel]
+build-frontend = "build"
+test-requires = "pytest"
+test-command = "pytest {project}/aidge_backend_cuda/unit_tests"
+# uncomment to run cibuildwheel locally on selected distros
+# build=[
+#     "cp38-manylinux_x86_64",
+#     "cp39-manylinux_x86_64",
+#     "cp310-manylinux_x86_64"
+# ]
+
+[tool.cibuildwheel.container-engine]
+# pass command line options to 'docker run'
+name = "docker"
+create-args = [
+    "--runtime=nvidia",
+    "--gpus", "all",
+    "--privileged",
+    "-v","/cache",
+    "-v","/var/run/docker.sock:/var/run/docker.sock",
+]
+
+### AIDGE DEPENDENCIES DECLARATION
+[tool.cibuildwheel.environment]
+# These variables are defined here for debugging purposes; when run from the CI, their values are set in .gitlab-ci.yml
+BUILD_WITH_CUDA=1
+AIDGE_DEPENDENCIES = "aidge_core aidge_backend_cpu" # format => "dep_1 dep_2 ... dep_n"
+AIDGE_INSTALL="/AIDGE_INSTALL_CIBUILDWHEEL"
+ARCH="x86_64"
+CUDNN_VERSION="9"
+CUDA_MAJOR_VERSION="11"
+CUDA_MINOR_VERSION="8"
+DOCKER_HOST="unix:///var/run/docker.sock"
+SEARCH_PATH="/home/ubuntu/aidge/aidge" # debug path
+# the two following variables are set within CMakeLists.txt when calling cibuildwheel from the CI
+LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+PATH="/usr/local/cuda/bin:$PATH"
+[tool.cibuildwheel.linux]
+before-build = [
+    "export CUDA_TOOLKIT_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION",
+    "echo '\n\n\n\n yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH'",
+    "yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/$ARCH/cuda-rhel7.repo",
+    "yum clean all",
+    "yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH",
+    "yum list available | grep cudnn",
+    "yum -y install libcudnn$CUDNN_VERSION-cuda-$CUDA_MAJOR_VERSION.$ARCH",
+    "yum -y install libcudnn$CUDNN_VERSION-devel-cuda-$CUDA_MAJOR_VERSION.$ARCH",
+    "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH",
+    "export PATH=/usr/local/cuda/bin:$PATH",
+    "which nvcc",
+    "nvcc --version",
+    "echo '\n\n\n\nInstalling required dependencies for aidge_backend_cuda.\n\n'",
+    "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host/$SEARCH_PATH"
+]
+before-test= [
+    "export CUDA_TOOLKIT_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION",
+    "echo '\n\n\n\n yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH'",
+    "yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/$ARCH/cuda-rhel7.repo",
+    "yum clean all",
+    "yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH",
+    "yum list available | grep cudnn",
+    "yum -y install libcudnn$CUDNN_VERSION-cuda-$CUDA_MAJOR_VERSION.$ARCH",
+    "yum -y install libcudnn$CUDNN_VERSION-devel-cuda-$CUDA_MAJOR_VERSION.$ARCH",
+    "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH",
+    "export PATH=/usr/local/cuda/bin:$PATH",
+    "nvcc --version",
+    "echo '\n\n\n\nInstalling required dependencies for aidge_backend_cuda.\n\n'",
+    "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host/$SEARCH_PATH"
+]
+[tool.cibuildwheel.windows]
+before-build = [
+    "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1"
+]
+before-test = [
+    "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1"
+]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/setup.py b/setup.py
index 80500f3165dd87eb7b6dd73c78b89806cc8a874a..706fc53ca08319ee487ef789ebc85f0d513ab25b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,37 +1,25 @@
 #!/usr/bin/env python3
-""" Aidge
-
-#TODO To change
-POC of the next framework named Aidge
-"""
-
-DOCLINES = (__doc__ or '').split("\n")
-
 import sys
 import os
 
-# Python supported version checks
-if sys.version_info[:2] < (3, 7):
-    raise RuntimeError("Python version >= 3.7 required.")
-
-
-CLASSIFIERS = """\
-Development Status :: 2 - Pre-Alpha
-"""
-
 import shutil
 import pathlib
-import subprocess
 import multiprocessing
 
 from math import ceil
 
+import toml
+
 from setuptools import setup, Extension
 from setuptools import find_packages
 from setuptools.command.build_ext import build_ext
 
+
 def get_project_name() -> str:
-    return open(pathlib.Path().absolute() / "project_name.txt", "r").read()
+    with open(pathlib.Path().absolute() / "pyproject.toml", "r") as file:
+        project_toml = toml.load(file)
+        return project_toml["project"]["name"]
+
 
 def get_project_version() -> str:
     aidge_root = pathlib.Path().absolute()
@@ -43,8 +31,8 @@ class CMakeExtension(Extension):
     def __init__(self, name):
         super().__init__(name, sources=[])
 
-class CMakeBuild(build_ext):
 
+class CMakeBuild(build_ext):
     def run(self):
         # This lists the number of processors available on the machine
         # The compilation will use half of them
@@ -62,17 +50,45 @@ class CMakeBuild(build_ext):
 
         os.chdir(str(build_temp))
 
-        # Impose to use the executable of the python
-        # used to launch setup.py to setup PythonInterp
-        param_py = "-DPYTHON_EXECUTABLE=" + sys.executable
+        compile_type = (
+            "Release"
+            if "AIDGE_PYTHON_BUILD_TYPE" not in os.environ
+            else os.environ["AIDGE_PYTHON_BUILD_TYPE"]
+        )
+
+        install_path = (
+            os.path.join(sys.prefix, "lib", "libAidge")
+            if "AIDGE_INSTALL" not in os.environ
+            else os.environ["AIDGE_INSTALL"]
+        )
+
+        # use Ninja as the default build generator: it builds faster and uses the same compiler as on Windows
+        build_gen = (
+            ["-G", os.environ["AIDGE_BUILD_GEN"]]
+            if "AIDGE_BUILD_GEN" in os.environ
+            else []
+        )
+        
+        self.spawn(
+            [
+                "cmake",
+                *build_gen,
+                str(cwd),
+                "-DTEST=OFF",
+                f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
+                f"-DCMAKE_BUILD_TYPE={compile_type}",
+                "-DPYBIND=ON",
+                "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
+                "-DCOVERAGE=OFF",
+                "-DCMAKE_CUDA_ARCHITECTURES=native",
+            ]
+        )
 
-        compile_type = 'Debug'
-        install_path = os.path.join(sys.prefix, "lib", "libAidge")  if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"]
-
-        self.spawn(['cmake', str(cwd), param_py, '-DTEST=OFF', f'-DCMAKE_INSTALL_PREFIX:PATH={install_path}', f'-DCMAKE_BUILD_TYPE={compile_type}'])
         if not self.dry_run:
-            self.spawn(['cmake', '--build', '.', '--config', compile_type, '-j', max_jobs])
-            self.spawn(['cmake', '--install', '.', '--config', compile_type])
+            self.spawn(
+                ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs]
+            )
+            self.spawn(["cmake", "--install", ".", "--config", compile_type])
         os.chdir(str(cwd))
 
         aidge_package = build_lib / (get_project_name())
@@ -83,8 +99,10 @@ class CMakeBuild(build_ext):
         # Copy all shared object files from build_temp/lib to aidge_package
         for root, _, files in os.walk(build_temp.absolute()):
             for file in files:
-                if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())):
-                    currentFile=os.path.join(root, file)
+                if (file.endswith(".so") or file.endswith(".pyd")) and (
+                    root != str(aidge_package.absolute())
+                ):
+                    currentFile = os.path.join(root, file)
                     shutil.copy(currentFile, str(aidge_package.absolute()))
 
         # Copy version.txt in aidge_package
@@ -92,23 +110,12 @@ class CMakeBuild(build_ext):
         shutil.copy("version.txt", str(aidge_package.absolute()))
 
 
-if __name__ == '__main__':
-
+if __name__ == "__main__":
     setup(
-        name=get_project_name(),
-        version=get_project_version(),
-        python_requires='>=3.7',
-        description=DOCLINES[0],
-        long_description_content_type="text/markdown",
-        long_description="\n".join(DOCLINES[2:]),
-        classifiers=[c for c in CLASSIFIERS.split('\n') if c],
-        packages=find_packages(where="."),
         include_package_data=True,
         ext_modules=[CMakeExtension(get_project_name())],
         cmdclass={
-            'build_ext': CMakeBuild,
+            "build_ext": CMakeBuild,
         },
-        install_requires=['aidge_core'],
         zip_safe=False,
-
     )
diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu
index 898475b5db325afcaedff44756cc2157cf9e2eec..c70b024fbab1a031ea69d5d9b169dc115b7320db 100644
--- a/src/data/TensorImpl.cu
+++ b/src/data/TensorImpl.cu
@@ -98,3 +98,34 @@ bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
     thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData.data());
     return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData);
 }
+
+template void Aidge::thrust_copy<double, double>(double const*, double*, unsigned long);
+template void Aidge::thrust_copy<double, float>(double const*, float*, unsigned long);
+template void Aidge::thrust_copy<double, int>(double const*, int*, unsigned long);
+template void Aidge::thrust_copy<float, double>(float const*, double*, unsigned long);
+template void Aidge::thrust_copy<float, float>(float const*, float*, unsigned long);
+template void Aidge::thrust_copy<float, int>(float const*, int*, unsigned long);
+template void Aidge::thrust_copy<int, double>(int const*, double*, unsigned long);
+template void Aidge::thrust_copy<int, float>(int const*, float*, unsigned long);
+template void Aidge::thrust_copy<int, int>(int const*, int*, unsigned long);
+template void Aidge::thrust_copy<long, double>(long const*, double*, unsigned long);
+template void Aidge::thrust_copy<long, float>(long const*, float*, unsigned long);
+template void Aidge::thrust_copy<long, int>(long const*, int*, unsigned long);
+template void Aidge::thrust_copy<short, double>(short const*, double*, unsigned long);
+template void Aidge::thrust_copy<short, float>(short const*, float*, unsigned long);
+template void Aidge::thrust_copy<short, int>(short const*, int*, unsigned long);
+template void Aidge::thrust_copy<signed char, double>(signed char const*, double*, unsigned long);
+template void Aidge::thrust_copy<signed char, float>(signed char const*, float*, unsigned long);
+template void Aidge::thrust_copy<signed char, int>(signed char const*, int*, unsigned long);
+template void Aidge::thrust_copy<unsigned char, double>(unsigned char const*, double*, unsigned long);
+template void Aidge::thrust_copy<unsigned char, float>(unsigned char const*, float*, unsigned long);
+template void Aidge::thrust_copy<unsigned char, int>(unsigned char const*, int*, unsigned long);
+template void Aidge::thrust_copy<unsigned int, double>(unsigned int const*, double*, unsigned long);
+template void Aidge::thrust_copy<unsigned int, float>(unsigned int const*, float*, unsigned long);
+template void Aidge::thrust_copy<unsigned int, int>(unsigned int const*, int*, unsigned long);
+template void Aidge::thrust_copy<unsigned long, double>(unsigned long const*, double*, unsigned long);
+template void Aidge::thrust_copy<unsigned long, float>(unsigned long const*, float*, unsigned long);
+template void Aidge::thrust_copy<unsigned long, int>(unsigned long const*, int*, unsigned long);
+template void Aidge::thrust_copy<unsigned short, double>(unsigned short const*, double*, unsigned long);
+template void Aidge::thrust_copy<unsigned short, float>(unsigned short const*, float*, unsigned long);
+template void Aidge::thrust_copy<unsigned short, int>(unsigned short const*, int*, unsigned long);
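
The instantiations above cover every source/destination combination used by the CUDA tensor implementation when casting between element types. As a minimal sketch of what such a cast-and-copy amounts to (illustrative only, assuming raw device pointers; this is not the actual Aidge::thrust_copy body), a Thrust-based version could look like:

#include <cstddef>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>

// Element-wise static_cast from SRC_T to DST_T, executed on the device.
template <class SRC_T, class DST_T>
struct StaticCastFunctor {
    __host__ __device__ DST_T operator()(SRC_T x) const { return static_cast<DST_T>(x); }
};

// Hypothetical helper: copy 'size' elements from srcData to dstData, converting the element type.
template <class SRC_T, class DST_T>
void castCopySketch(const SRC_T* srcData, DST_T* dstData, std::size_t size) {
    thrust::device_ptr<const SRC_T> src(srcData);
    thrust::device_ptr<DST_T> dst(dstData);
    thrust::transform(src, src + size, dst, StaticCastFunctor<SRC_T, DST_T>());
}
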
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 74d89c405530766324407fc42345f237931dc2f4..de7ea925554906ea5fe1e5dcba268b17a06a47bd 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -44,6 +44,10 @@ void Aidge::AddImpl_cuda::forward() {
         std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
         dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
 
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
         // Compute the corresponding strides
         std::vector<int> tensorStrides(dims[i].size());
         int product = 1;
@@ -191,7 +195,7 @@ void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<
                                tensorDesc,
                                &workspaceSize));
 
-            float *d_workspace;
+            void *d_workspace;
             CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
 
             CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
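
The forward path above pads every input shape to at least 4 dimensions before computing strides, so the cuDNN tensor descriptors (which expect at least 4 dimensions) are always valid. The stride loop itself is a plain row-major computation; a small host-side sketch of the same logic (hypothetical helper name, not part of the patch):

#include <cstddef>
#include <vector>

// Row-major strides for a given shape, innermost dimension last.
// Example: dims {2, 3, 4, 5} -> strides {60, 20, 5, 1}.
std::vector<int> rowMajorStrides(const std::vector<int>& dims) {
    std::vector<int> strides(dims.size());
    int product = 1;
    for (std::size_t j = dims.size(); j > 0; --j) {
        strides[j - 1] = product;
        product *= dims[j - 1];
    }
    return strides;
}
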
diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1ee9ebcb9437b89666da21a915907b5434ece26
--- /dev/null
+++ b/src/operator/AndImpl.cpp
@@ -0,0 +1,95 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AndImpl.hpp"
+#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/And.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::AndImpl_cuda::forward() {
+    const And_Op& op = static_cast<const And_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in And operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run And forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot And inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AndImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
+    const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+    std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
+    if(op.getOutput(0)->nbDims()>1) {
+        for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
+            outputStrides[i] = outputStrides[i+1] *  op.getOutput(0)->dims()[i+1];
+        }
+    }
+
+    Aidge::AndForward<T>(input1Ptr, input2Ptr, outputPtr,
+                inputsDims[0], inputsDims[1],
+                inputsStrides[0], inputsStrides[1], outputStrides,
+                static_cast<int>(op.getOutput(0)->size()));
+}
\ No newline at end of file
diff --git a/src/operator/AndImpl_CUDA_kernels.cu b/src/operator/AndImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..34bfccf98c013d8bfc934325f4e327cbae9e7b4a
--- /dev/null
+++ b/src/operator/AndImpl_CUDA_kernels.cu
@@ -0,0 +1,95 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cuda_fp16.h>
+
+#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp"
+
+// Helper function for comparison
+template <typename T>
+__device__ bool compareE(T a, T b) {
+    return a == b;
+}
+template <>
+__device__ bool compareE<half>(half a, half b) {
+    return __half2float(a) == __half2float(b);
+}
+
+template <typename T>
+__global__ void and_cuda_Kernel(const T* input1, const T* input2, T* output,
+                          int* input1_shape, int* input2_shape,
+                          int* input1_strides, int* input2_strides, int* output_strides,
+                          int num_dims, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
+    int input1_idx = 0, input2_idx = 0;
+    int temp_idx = idx;
+    for (int i = 0; i < num_dims; ++i) {
+        int dim = temp_idx / output_strides[i];
+        temp_idx %= output_strides[i];
+        input1_idx += (input1_shape[i] == 1 ? 0 : dim) * input1_strides[i];
+        input2_idx += (input2_shape[i] == 1 ? 0 : dim) * input2_strides[i];
+    }
+
+    output[idx] = static_cast<T>(compareE(input1[input1_idx], input2[input2_idx]));
+}
+
+template <typename T>
+void Aidge::AndForward(const T* input1, const T* input2, T* output,
+                        const std::vector<int>& input1Dims,const std::vector<int>& input2Dims,
+                        const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                        int outSize)
+{
+    int *d_input1_strides, *d_input2_strides, *d_output_strides, *d_input1_shape, *d_input2_shape;
+    // Allocate device memory
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input1_shape, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input2_shape, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input1_strides, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input2_strides, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, input1Dims.size() * sizeof(int)));
+
+    // Copy data from host to device;
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input1_shape, input1Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input2_shape, input2Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input1_strides, input1Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input2_strides, input2Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    int blockSize = 256;
+    int numBlocks = (outSize + blockSize - 1) / blockSize;
+
+    int num_dims = input1Dims.size();
+    // Launch the kernel
+    and_cuda_Kernel<<<numBlocks, blockSize>>>(input1, input2, output,
+                                            d_input1_shape, d_input2_shape,
+                                            d_input1_strides, d_input2_strides, d_output_strides,
+                                            num_dims, outSize);
+    CHECK_CUDA_STATUS(cudaFree(d_input1_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input2_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input1_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_input2_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_output_strides));
+};
+
+template void Aidge::AndForward(const double* input1, const double* input2, double* output,
+                        const std::vector<int>& input1Dims,const std::vector<int>& input2Dims,
+                        const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                        int outSize);
+
+template void Aidge::AndForward(const float* input1, const float* input2, float* output,
+                        const std::vector<int>& input1Dims,const std::vector<int>& input2Dims,
+                        const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                        int outSize);
+
+template void Aidge::AndForward(const half* input1, const half* input2, half* output,
+                        const std::vector<int>& input1Dims,const std::vector<int>& input2Dims,
+                        const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                        int outSize);
\ No newline at end of file
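
The kernel recovers, for every flat output index, the matching flat indices in the two inputs by walking the output strides and zeroing the contribution of broadcast (size-1) dimensions. A host-side reference of that index arithmetic, handy for checking the kernel against a CPU result (illustrative sketch, not part of the patch):

#include <cstddef>
#include <vector>

// Mirror of the index mapping in and_cuda_Kernel: map a flat output index to the
// flat indices of two possibly broadcast inputs.
void broadcastIndices(int outIdx,
                      const std::vector<int>& in1Shape, const std::vector<int>& in2Shape,
                      const std::vector<int>& in1Strides, const std::vector<int>& in2Strides,
                      const std::vector<int>& outStrides,
                      int& in1Idx, int& in2Idx) {
    in1Idx = 0;
    in2Idx = 0;
    int temp = outIdx;
    for (std::size_t i = 0; i < outStrides.size(); ++i) {
        const int coord = temp / outStrides[i];  // coordinate along dimension i
        temp %= outStrides[i];
        in1Idx += (in1Shape[i] == 1 ? 0 : coord) * in1Strides[i];  // broadcast dims contribute 0
        in2Idx += (in2Shape[i] == 1 ? 0 : coord) * in2Strides[i];
    }
}
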
diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50d00592ca70333d6fbdd7a10761a0ea2e9beb4b
--- /dev/null
+++ b/src/operator/ArgMaxImpl.cpp
@@ -0,0 +1,74 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
+#include "aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ArgMaxImpl_cuda::forward() {
+    const ArgMax_Op& op = dynamic_cast<const ArgMax_Op&>(mOp);
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input in ArgMax operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ArgMax forward because the input has no implementation.");
+
+    const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const std::int32_t axis = op.axis();
+    const DimSize_t selectLastIdx = op.selectLastIndex();
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input, axis, selectLastIdx);
+            break;
+        case DataType::Float32:
+            forward_<float>(input, axis, selectLastIdx);
+            break;
+        case DataType::Float16:
+            forward_<half>(input, axis, selectLastIdx);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+
+template <class T>
+void Aidge::ArgMaxImpl_cuda::forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+
+    const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+    std::vector<int> inputStrides(op.getInput(0)->nbDims(), 1);
+    if(op.getInput(0)->nbDims()>1) {
+        for (int i = op.getInput(0)->nbDims()-2; i >= 0; i--) {
+            inputStrides[i] = inputStrides[i+1] *  op.getInput(0)->dims()[i+1];
+        }
+    }
+
+    std::vector<int> inputShape(input.nbDims());
+
+    // Use std::transform to convert each element
+    std::transform(input.dims().begin(), input.dims().end(), inputShape.begin(),
+                   [](size_t value) {
+                       return static_cast<int>(value);
+                   });
+    Aidge::ArgMax_cuda_forward_kernel<T>(inputPtr, outputPtr,
+                                      inputShape, inputStrides,
+                                      axis, static_cast<int>(op.getInput(0)->size()), selectLastIdx);
+}
diff --git a/src/operator/ArgMaxImpl_CUDA_kernels.cu b/src/operator/ArgMaxImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7010236d06135171309ac63f0e2d93fa85ff76d8
--- /dev/null
+++ b/src/operator/ArgMaxImpl_CUDA_kernels.cu
@@ -0,0 +1,161 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp"
+#define MAX_ERRORS 10
+// Helper function for comparison
+template <typename T>
+__device__ bool compareGT(T a, T b) {
+    return a > b;
+}
+template <>
+__device__ bool compareGT<half>(half a, half b) {
+    return __half2float(a) > __half2float(b);
+}
+
+// Helper function for comparison
+template <typename T>
+__device__ bool compareGE(T a, T b) {
+    return a >= b;
+}
+template <>
+__device__ bool compareGE<half>(half a, half b) {
+    return __half2float(a) >= __half2float(b);
+}
+template <typename T>
+__global__ void argmax_forward(const T* input, T* output, int* dims, int* strides, int axis, int total_elems, T minValue) {
+    const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = blockDim.x * gridDim.x;
+
+    for (unsigned int idx = index; idx < total_elems; idx += stride) {
+        int axis_stride = strides[axis];
+        int axis_dim = dims[axis];
+        int outer_stride = idx / axis_stride;
+        int inner_stride = idx % axis_stride;
+
+        T max_val = minValue;
+        int max_idx = 0;
+
+        for (int i = 0; i < axis_dim; ++i) {
+            int offset = outer_stride * axis_dim * axis_stride + i * axis_stride + inner_stride;
+            if (offset >= total_elems) {
+                return;
+            }
+            T val = input[offset];
+            if (compareGT(val, max_val)) {
+                max_val = val;
+                max_idx = i;
+            }
+        }
+
+        int output_index = outer_stride * axis_stride + inner_stride;
+        if (output_index >= (total_elems / axis_dim)) {
+            return;
+        }
+        output[output_index] = max_idx;
+    }
+}
+
+template <typename T>
+__global__ void argmax_forward_selectLastIdx(const T* input, T* output, int* dims, int* strides, int axis, int total_elems, T minValue) {
+    const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = blockDim.x * gridDim.x;
+
+    for (unsigned int idx = index; idx < total_elems; idx += stride) {
+        int axis_stride = strides[axis];
+        int axis_dim = dims[axis];
+        int outer_stride = idx / axis_stride;
+        int inner_stride = idx % axis_stride;
+
+        T max_val = minValue;
+        int max_idx = 0;
+
+        for (int i = 0; i < axis_dim; ++i) {
+            int offset = outer_stride * axis_dim * axis_stride + i * axis_stride + inner_stride;
+            if (offset >= total_elems) {
+                return;
+            }
+            T val = input[offset];
+            if (compareGE(val, max_val)) {
+                max_val = val;
+                max_idx = i;
+            }
+        }
+
+        int output_index = outer_stride * axis_stride + inner_stride;
+        if (output_index >= (total_elems / axis_dim)) {
+            return;
+        }
+        output[output_index] = max_idx;
+    }
+}
+
+template <typename T>
+T minValue();
+
+template <>
+double minValue<double>() {
+    return std::numeric_limits<double>::lowest(); // lowest, not min(), so argmax also works for all-negative inputs
+}
+
+template <>
+float minValue<float>() {
+    return std::numeric_limits<float>::lowest();
+}
+
+template <>
+half minValue<half>() {
+    return __float2half(-65504.0f); // lowest finite half-precision value
+}
+
+template <typename T>
+void Aidge::ArgMax_cuda_forward_kernel(const T* input, T* output,
+                                const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
+                                int axis, int total_elems, std::size_t selectLastIdx) {
+
+    // Define block and grid sizes
+    int blockSize = 256;
+    int gridSize = (total_elems + blockSize - 1) / blockSize;
+
+
+    int *d_input_strides, *d_input_shape;
+    // Allocate device memory
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input_shape, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input_strides, inputDims.size() * sizeof(int)));
+
+    // Copy data from host to device;
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input_shape, inputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input_strides, inputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    // Launch the kernel
+    if (selectLastIdx) {
+        argmax_forward_selectLastIdx<<<gridSize, blockSize>>>(input, output, d_input_shape, d_input_strides, axis, total_elems, minValue<T>());
+    }
+    else {
+        argmax_forward<<<gridSize, blockSize>>>(input, output, d_input_shape, d_input_strides, axis, total_elems, minValue<T>());
+    }
+
+    CHECK_CUDA_STATUS(cudaFree(d_input_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input_strides));
+}
+
+
+
+template void Aidge::ArgMax_cuda_forward_kernel(const double* input, double* output,
+                                const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
+                                int axis, int total_elems, std::size_t selectLastIdx);
+
+template void Aidge::ArgMax_cuda_forward_kernel(const float* input, float* output,
+                                const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
+                                int axis, int total_elems, std::size_t selectLastIdx);
+
+template void Aidge::ArgMax_cuda_forward_kernel(const half* input, half* output,
+                                const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
+                                int axis, int total_elems, std::size_t selectLastIdx);
\ No newline at end of file
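
For a quick sanity check of these kernels, the same reduction can be reproduced on the host. A hedged CPU reference for the common 2-D case with axis = 1 (made-up helper, not part of the patch); with selectLastIdx set, ties resolve to the highest index, matching the compareGE variant above:

#include <cstddef>
#include <vector>

// CPU reference: index of the maximum along the last axis of a [rows x cols] buffer.
std::vector<int> argmaxLastAxis(const std::vector<float>& data, std::size_t rows, std::size_t cols,
                                bool selectLastIdx) {
    std::vector<int> out(rows, 0);
    for (std::size_t r = 0; r < rows; ++r) {
        float best = data[r * cols];
        for (std::size_t c = 1; c < cols; ++c) {
            const float v = data[r * cols + c];
            if (selectLastIdx ? (v >= best) : (v > best)) {
                best = v;
                out[r] = static_cast<int>(c);
            }
        }
    }
    return out;
}
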
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index b627f69a289340b42e1de4baa6bb09d1ea2e5e99..24e01db03692ffaa884b31a224a1947a9e1645a0 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -24,14 +24,18 @@ template <Aidge::DimIdx_t DIM>
 void Aidge::ConvImpl_cuda<DIM>::forward() {
     const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
 
-    // FIXME: uncomment the following code once memory handling will work
-    assert(mOp.getRawInput(0) && "missing input #0");
-    assert(mOp.getRawInput(1) && "missing input #1");
+    AIDGE_ASSERT(op.getInput(0), "missing input #0");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation.");
+    AIDGE_ASSERT(op.getInput(1), "missing input #1");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1st input has no implementation.");
 
     // Convert input data (no overhead if not needed!)
     const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0));
     const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0));
-    const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));
+    Tensor input2;
+    if(op.getInput(2) && op.getInput(2)->hasImpl()) {
+        input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));
+    }
 
     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
@@ -48,14 +52,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
             upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end());
         }
 
-            CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
-            CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
-                DIM,
-                &paddings[0],
-                &strides[0],
-                &upscales[0],
-                CUDNN_CROSS_CORRELATION,
-                DataTypeToCudnn(op.getOutput(0)->dataType())));
+        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
+        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
+            DIM,
+            &paddings[0],
+            &strides[0],
+            &upscales[0],
+            CUDNN_CROSS_CORRELATION,
+            DataTypeToCudnn(op.getOutput(0)->dataType())));
     }
 
     // Lazy-initialize CuDNN filter descriptor
@@ -72,27 +76,6 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
 
     // Set forward algorithm and allocate the required workspace
     if (mFwdWorkspace == nullptr) {
-        // Find the best CuDNN forward algorithm (the one with the lowest compute time)
-        int maxAlgoIterations = 0;
-        cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
-                                                    &maxAlgoIterations);
-
-        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionForwardAlgorithm");
-
-        int returnAlgoCounts = 0;
-        std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations);
-
-        CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
-            CudaContext::cudnnHandle(),
-            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
-            mFilterDesc,
-            mConvDesc,
-            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
-            maxAlgoIterations,
-            &returnAlgoCounts,
-            &returnFwdAlgo[0]));
-        mFwdAlgo = returnFwdAlgo[0].algo;
-
         // Allocate the workspace required by the chosen CuDNN forward algorithm
         size_t workspaceSize = 0;
 
@@ -166,14 +149,18 @@ template <Aidge::DimIdx_t DIM>
 void Aidge::ConvImpl_cuda<DIM>::backward() {
     const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
 
-    // FIXME: uncomment the following code once memory handling will work
-    assert(mOp.getRawInput(0) && "missing input #0");
-    assert(mOp.getRawInput(1) && "missing input #1");
+    AIDGE_ASSERT(op.getInput(0), "missing input #0");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation.");
+    AIDGE_ASSERT(op.getInput(1), "missing input #1");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1st input has no implementation.");
 
     // Convert input data (no overhead if not needed!)
-    const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0));
-    const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0));
-    const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0));
+    const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0));
+    Tensor input2;
+    if(op.getInput(2) && op.getInput(2)->hasImpl()) {
+        input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));
+    }
 
     // Set forward algorithm and allocate the required workspace
     if (mBwdWorkspace == nullptr) {
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0326a60c1a3aabf43ca3a1d892328991d6d72366
--- /dev/null
+++ b/src/operator/DivImpl.cpp
@@ -0,0 +1,113 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/DivImpl.hpp"
+#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Div.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::DivImpl_cuda::forward() {
+    const Div_Op& op = static_cast<const Div_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Div operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Div forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Div operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Div forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Div inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::DivImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
+    const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+    std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
+    if(op.getOutput(0)->nbDims()>1) {
+        for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
+            outputStrides[i] = outputStrides[i+1] *  op.getOutput(0)->dims()[i+1];
+        }
+    }
+    std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1);
+    for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) {
+        outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]);
+    }
+
+    Aidge::divForward<T>(input1Ptr, outputPtr, input2Ptr,
+                inputsDims[0], inputsDims[1], outDims,
+                inputsStrides[0], inputsStrides[1], outputStrides,
+                static_cast<int>(op.getOutput(0)->size()));
+}
+
+void Aidge::DivImpl_cuda::backward() {
+    // TODO
+}
+
+template <class T>
+void Aidge::DivImpl_cuda::backward_(const Tensor& outGrad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    // TODO
+}
\ No newline at end of file
diff --git a/src/operator/DivImpl_CUDA_kernels.cu b/src/operator/DivImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7ff5a2865a32cb38bf92571bdea6bf90ca8516eb
--- /dev/null
+++ b/src/operator/DivImpl_CUDA_kernels.cu
@@ -0,0 +1,100 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp"
+
+
+// Helper function for Div
+template <typename T>
+__device__ T div(T a, T b) {
+    return a / b;
+}
+template <>
+__device__ half div<half>(half a, half b) {
+#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 8000
+    return __hdiv(a, b);
+#else
+    return __float2half(__half2float(a) / __half2float(b));
+#endif
+}
+
+template <class T>
+__global__ void divKernel(const T* input1, T* output, const T* input2,
+                          int* input1_shape, int* input2_shape, int* output_shape,
+                          int* input1_strides, int* input2_strides, int* output_strides,
+                          int num_dims, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
+    int input1_idx = 0, input2_idx = 0;
+    int temp_idx = idx;
+    for (int i = 0; i < num_dims; ++i) {
+        int dim = temp_idx / output_strides[i];
+        temp_idx %= output_strides[i];
+        input1_idx += (input1_shape[i] == 1 ? 0 : dim) * input1_strides[i];
+        input2_idx += (input2_shape[i] == 1 ? 0 : dim) * input2_strides[i];
+    }
+    output[idx] = div(input1[input1_idx], input2[input2_idx]);
+}
+
+template <class T>
+void Aidge::divForward(const T* input1, T* output, const T* input2,
+                        const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
+                        const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                        int outSize)
+{
+    int *d_input1_strides, *d_input2_strides, *d_output_strides, *d_input1_shape, *d_input2_shape, *d_output_shape;
+    // Allocate device memory
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input1_shape, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input2_shape, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_output_shape, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input1_strides, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input2_strides, input1Dims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, input1Dims.size() * sizeof(int)));
+
+    // Copy data from host to device;
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input1_shape, input1Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input2_shape, input2Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_output_shape, outputDims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input1_strides, input1Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input2_strides, input2Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    int blockSize = 256;
+    int numBlocks = (outSize + blockSize - 1) / blockSize;
+
+    int num_dims = input1Dims.size();
+    // Launch the kernel
+    divKernel<<<numBlocks, blockSize>>>(input1, output, input2,
+                                        d_input1_shape, d_input2_shape, d_output_shape,
+                                        d_input1_strides, d_input2_strides, d_output_strides,
+                                        num_dims, outSize);
+    CHECK_CUDA_STATUS(cudaFree(d_input1_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input2_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_output_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input1_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_input2_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_output_strides));
+};
+
+template void Aidge::divForward<double>(const double* input1, double* output, const double* input2,
+                                const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
+                                const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                                int outSize);
+
+template void Aidge::divForward<float>(const float* input1, float* output, const float* input2,
+                                const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
+                                const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                                int outSize);
+
+template void Aidge::divForward<half>(const half* input1, half* output, const half* input2,
+                                const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
+                                const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
+                                int outSize);
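
Since divForward takes raw device pointers plus 4-D shapes and strides, it can be exercised in isolation. A hypothetical standalone usage (compiled with nvcc and linked against this backend; the shapes, strides and values are made up) dividing a 2x2 tensor by a broadcast scalar:

#include <vector>
#include <cuda_runtime.h>
#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp"

int main() {
    const std::vector<float> hA = {1.f, 2.f, 3.f, 4.f};  // 2x2 tensor, stored with shape {1,1,2,2}
    const std::vector<float> hB = {2.f};                  // scalar, stored with shape {1,1,1,1}
    std::vector<float> hOut(4);

    float *dA = nullptr, *dB = nullptr, *dOut = nullptr;
    cudaMalloc(&dA, 4 * sizeof(float));
    cudaMalloc(&dB, 1 * sizeof(float));
    cudaMalloc(&dOut, 4 * sizeof(float));
    cudaMemcpy(dA, hA.data(), 4 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), 1 * sizeof(float), cudaMemcpyHostToDevice);

    const std::vector<int> dimsA = {1, 1, 2, 2}, dimsB = {1, 1, 1, 1}, dimsOut = {1, 1, 2, 2};
    const std::vector<int> strA = {4, 4, 2, 1}, strB = {1, 1, 1, 1}, strOut = {4, 4, 2, 1};

    Aidge::divForward<float>(dA, dOut, dB, dimsA, dimsB, dimsOut, strA, strB, strOut, 4);

    cudaMemcpy(hOut.data(), dOut, 4 * sizeof(float), cudaMemcpyDeviceToHost);  // hOut = {0.5, 1, 1.5, 2}
    cudaFree(dA); cudaFree(dB); cudaFree(dOut);
    return 0;
}
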
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 9948ee1356ad4fedb5d830016ae66ca69a033e38..1a7bb8edb51312d08467354e20723ad19176bfee 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -110,7 +110,7 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
                                        output,
                                        n));
 
-        cudaFree(onesVector);
+        CHECK_CUDA_STATUS(cudaFree(onesVector));
     }
 
 }
@@ -125,7 +125,7 @@ void Aidge::FCImpl_cuda::backward() {
 
     const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
     const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
-    const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0));
+    const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor();
 
     switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
         case DataType::Float64:
@@ -156,10 +156,9 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c
 
     // Performing weightsGrad = (input) * T(outputGrad)
     //              [n x m]   = [n x k] *   [k x m]
-    int m = input0.dims()[input0.nbDims()-1];
+    int m = input1.dims()[1];
     int k = input0.size()/m;
     int n = input1.size()/m;
-    int input0LastDim = input0.dims()[input0.nbDims()-1];
     CHECK_CUBLAS_STATUS(cublasGemm(
         CudaContext::cublasHandle(),
         CUBLAS_OP_N,
@@ -190,7 +189,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c
         CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(),
                                        CUBLAS_OP_N,
                                        outChannels,
-                                       input0LastDim,
+                                       k,
                                        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
                                        outputGrad,
                                        outChannels,
@@ -199,7 +198,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c
                                        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
                                        biasGrad,
                                        1));
-        cudaFree(onesVector);
+        CHECK_CUDA_STATUS(cudaFree(onesVector));
     }
     // Performing inputGrad = (weights) * (outputGrad)
     CHECK_CUBLAS_STATUS(cublasGemm(
@@ -207,7 +206,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c
         CUBLAS_OP_N,
         CUBLAS_OP_N,
         op.getInput(1)->grad()->size()/outChannels,
-        input0LastDim,
+        k,
         outChannels,
         reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
         weights,//w
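
The dimension fix above ties the GEMM sizes to the weight tensor rather than to the activation's last dimension. With input0 (activations) of shape [batch x inFeatures] and input1 (weights) of shape [outChannels x inFeatures], the sizes work out as in this small worked example (the concrete numbers are made up):

#include <cassert>
#include <vector>

int main() {
    const std::vector<int> input0Dims = {8, 16};  // batch x inFeatures
    const std::vector<int> input1Dims = {4, 16};  // outChannels x inFeatures

    const int m = input1Dims[1];                        // 16 = inFeatures
    const int k = (input0Dims[0] * input0Dims[1]) / m;  // 8  = batch
    const int n = (input1Dims[0] * input1Dims[1]) / m;  // 4  = outChannels

    // weightsGrad is [n x m] = [outChannels x inFeatures], i.e. the weight shape,
    // and k is the contraction (batch) dimension shared with outputGrad.
    assert(m == 16 && k == 8 && n == 4);
    return 0;
}
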
diff --git a/src/operator/ILayerNormImpl.cpp b/src/operator/ILayerNormImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47dd1d5d1a3f127c9e08788f605796020a7814a7
--- /dev/null
+++ b/src/operator/ILayerNormImpl.cpp
@@ -0,0 +1,204 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 10.09.2024
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+#include <algorithm>  // For std::max
+#include <cmath>      // For pow
+#include <typeinfo>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
+#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ILayerNorm.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ILayerNormImpl_cuda::forward() {
+
+
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+    assert(mOp.getRawInput(1) && "missing input #1");
+    assert(mOp.getRawInput(2) && "missing input #2");
+
+    const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0));
+    const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input0, input1, input2);
+            break;
+        case DataType::Float32:
+            forward_<float>(input0, input1, input2);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+
+template<class T>
+void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2)
+{
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input_raw = static_cast<const T*>(input0.getImpl()->rawPtr());
+    const T * weight = static_cast<const T*>(input1.getImpl()->rawPtr());
+    const T * bias = static_cast<const T*>(input2.getImpl()->rawPtr());
+    T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    int N = 15;
+    int output_bits = 8;
+    size_t size = input0.size();
+    std::vector<DimSize_t> dims_input = input0.dims();
+
+    // maybe find a more efficient way to compute the scaling factor (a max and a min function could help retrieve it)
+
+    double min = std::numeric_limits<double>::max();
+    double max = std::numeric_limits<double>::lowest();
+    for(std::size_t i = 0; i < dims_input[0]; i++) {
+        for(std::size_t j = 0; j < dims_input[1]; j++) {
+            for(std::size_t k = 0; k < dims_input[2]; k++) {
+                for(std::size_t l = 0; l < dims_input[3]; l++) {
+                    std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    // compare the value stored at this coordinate, not its flat index
+                    const double value = static_cast<double>(input_raw[input0.getIdx(coordIdx)]);
+                    if (value < min) {
+                        min = value;
+                    }
+                    if (value > max) {
+                        max = value;
+                    }
+                }
+            }
+        }
+    }
+    double m = std::max(std::abs(min), std::abs(max));
+    double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1;
+    double scaling_factor =  m / normalization_factor;
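+    // Symmetric quantization scale: with output_bits = 8 the normalization factor is 127,
+    // so the largest absolute input value maps to the edge of the signed 8-bit range.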
+    
+    // The new scaling factor that we can use to dequantify the returned tensor (not used here)
+    // double new_SF = 1/std::pow(2,2*output_bits-1); 
+
+    ILayerNormforward(input_raw, output, scaling_factor, weight, bias, size, dims_input);
+}
+
+void Aidge::ILayerNormImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    size_t size = output_grad.size();
+    std::vector<DimSize_t> dims_input = output_grad.dims();
+
+    const T * output = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr());
+
+    T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
+    T * weight_grad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr());
+    T * bias_grad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr());
+    
+    const T * input = static_cast<const T*>(op.getInput(0)->getImpl()->rawPtr());
+    const T * weight = static_cast<const T*>(op.getInput(1)->getImpl()->rawPtr());
+    const T * bias = static_cast<const T*>(op.getInput(2)->getImpl()->rawPtr());
+
+    // maybe find a more efficient way to compute the mean and variance tensors
+    const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr());
+
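+    // The per-(i, j, k) mean and variance of the incoming gradient are expanded to the full
+    // tensor shape so that the CUDA kernel can read them element-wise.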
+    std::vector<std::vector<std::vector<std::vector<T>>>> means(dims_input[0],
+        std::vector<std::vector<std::vector<T>>>(dims_input[1],
+            std::vector<std::vector<T>>(dims_input[2],
+                std::vector<T>(dims_input[3], 0.0f))));
+
+    for (std::size_t i = 0; i < dims_input[0]; i++) {
+        for (std::size_t j = 0; j < dims_input[1]; j++) {
+            for (std::size_t k = 0; k < dims_input[2]; k++) {
+                T sum = 0.0f;
+
+                for (std::size_t l = 0; l < dims_input[3]; l++) {
+                    std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    sum += output_grad_raw[output_grad.getIdx(coordIdx)];
+                }
+                for (std::size_t l = 0; l < dims_input[3]; l++) {
+                    std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    means[i][j][k][l] = sum / static_cast<T>(dims_input[3]);
+                }
+            }
+        }
+    }
+    std::vector<T> flat_means;
+
+    for (const auto &vec3d : means) {
+        for (const auto &vec2d : vec3d) {
+            for (const auto &vec1d : vec2d) {
+                flat_means.insert(flat_means.end(), vec1d.begin(), vec1d.end());
+            }
+        }
+    }
+
+    std::vector<std::vector<std::vector<std::vector<T>>>> vars(dims_input[0],
+        std::vector<std::vector<std::vector<T>>>(dims_input[1],
+            std::vector<std::vector<T>>(dims_input[2],
+                std::vector<T>(dims_input[3], 0.0f))));
+    
+    for (std::size_t i = 0; i < dims_input[0]; i++) {
+        for (std::size_t j = 0; j < dims_input[1]; j++) {
+            for (std::size_t k = 0; k < dims_input[2]; k++) {
+                T sum_sq_diff = 0.0f;
+
+                for (std::size_t l = 0; l < dims_input[3]; l++) {
+                    std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    T value = output_grad_raw[output_grad.getIdx(coordIdx)];
+                    T diff = value - means[i][j][k][l];
+                    sum_sq_diff += diff * diff;
+                }
+                T variance = sum_sq_diff / static_cast<T>(dims_input[3]);
+                for (std::size_t l = 0; l < dims_input[3]; l++) {
+                    vars[i][j][k][l] = variance;
+                }
+            }
+        }
+    }
+
+    std::vector<T> flat_vars;
+
+    for (const auto &vec3d : vars) {
+        for (const auto &vec2d : vec3d) {
+            for (const auto &vec1d : vec2d) {
+                flat_vars.insert(flat_vars.end(), vec1d.begin(), vec1d.end());
+            }
+        }
+    }
+
+    const T* mean_ = flat_means.data();
+    const T* var_ = flat_vars.data();
+
+    ILayerNormbackward(output, output_grad_raw, input, mean_, var_, weight, bias, input_grad, weight_grad, bias_grad, size);
+}
diff --git a/src/operator/ILayerNormImpl_CUDA_kernels.cu b/src/operator/ILayerNormImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fafdc176fdad6a6130c9bc4374d75f8a773f2c16
--- /dev/null
+++ b/src/operator/ILayerNormImpl_CUDA_kernels.cu
@@ -0,0 +1,335 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 10.09.2024
+ *
+ ********************************************************************************/
+
+#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y))
+#define CLAMP(X) (((X) < (0)) ? (0) : (X))
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp"
+
+namespace Aidge{
+
+template <class T>
+__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor,long long int* square_tensor, T* weight, T* biase, double new_SF) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+
+    int k = 1 << 16;
+    long long int sum = 0;
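+    // Each thread handles one (x, y, z) position and loops over the whole last dimension.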
+    if (x < dims[0] && y < dims[1] && z < dims[2]) {
+        int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3];
+        int val;
+        int mean_val = 0;
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            val = roundf(input[idx] / SF);
+            quantized_tensor[idx] = val;
+            mean_val += val;
+        }
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            quantized_tensor[idx] -= (mean_val/dims[3]) ;
+            square_tensor[idx] = (quantized_tensor[idx] * quantized_tensor[idx]); // I-ViT code implementation
+            //square_tensor[idx] = (quantized_tensor[idx] * quantized_tensor[idx])/dims[3]; // I-ViT paper implementation
+        }
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            sum += square_tensor[idx];
+            biase[i] =  (biase[i]/weight[i])/new_SF;
+            weight[i] = weight[i] * new_SF;
+        }
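+        // 10 Newton-Raphson iterations approximating the integer square root of `sum`,
+        // starting from k = 2^16 (integer-only normalisation, following the I-ViT approach).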
+        for(int h = 0; h < 10 ; h++)
+        {
+            k = floorf((k + floorf(sum / k))/2);
+        }
+        int factor = (((1 << 31) - 1) / k);
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            square_tensor[idx] =  (biase[idx]/weight[idx])/new_SF;
+            quantized_tensor[idx] = (quantized_tensor[idx] * factor / 2) + biase[maxIdx];
+            input[idx] = quantized_tensor[idx] * new_SF;
+        }
+
+    }
+}
+
+template <>
+void ILayerNormforward<float>(const float* input, float* output, double SF, const float* weight_raw, const float* bias_raw, size_t size, std::vector<long unsigned int> dims_input)
+{
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    double new_SF = std::sqrt(dims_input_cuda[3]) / (1 << 30);
+
+    float* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(float));
+    cudaMemcpy(input_cuda_tensor,input, size * sizeof(float),cudaMemcpyHostToDevice);
+
+    int *quantized_tensor;
+    cudaMalloc(&quantized_tensor, size * sizeof(int));
+
+    int *dims;
+    cudaMalloc(&dims, 4 * sizeof(int));
+    cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    float *weight;
+    cudaMalloc(&weight,dims_input_cuda[3]*sizeof(float));
+    cudaMemcpy(weight,weight_raw,dims_input_cuda[3]*sizeof(float),cudaMemcpyHostToDevice);
+
+    float *bias;
+    cudaMalloc(&bias,dims_input_cuda[3]*sizeof(float));
+    cudaMemcpy(bias,bias_raw,dims_input_cuda[3]*sizeof(float),cudaMemcpyHostToDevice);
+
+    long long int* Squaretensor;
+    cudaMalloc(&Squaretensor,(size)*sizeof(long long int));
+
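+    // 10x10x10 = 1000 threads per block, one thread per position of the first three
+    // dimensions; the last dimension is iterated inside the kernel.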
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks((dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    ILayerNormforward_<float><<<numBlocks,threadsPerBlock>>>(input_cuda_tensor,SF,dims,quantized_tensor,Squaretensor,weight,bias,new_SF);
+    cudaDeviceSynchronize();
+
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
+    }
+    cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(float), cudaMemcpyDeviceToHost);
+
+
+    cudaFree(input_cuda_tensor);
+    cudaFree(weight);
+    cudaFree(bias);
+    cudaFree(dims);
+    cudaFree(quantized_tensor);
+    cudaFree(Squaretensor);
+}
+
+template <>
+void ILayerNormforward<double>(const double* input, double* output, double SF, const double* weight_raw, const double* bias_raw, size_t size, std::vector<long unsigned int> dims_input)
+{
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    double new_SF = std::sqrt(dims_input_cuda[3]) / (1 << 30);
+
+    double* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(double));
+    cudaMemcpy(input_cuda_tensor,input, size * sizeof(double),cudaMemcpyHostToDevice);
+
+    int *quantized_tensor;
+    cudaMalloc(&quantized_tensor, size * sizeof(int));
+
+    int *dims;
+    cudaMalloc(&dims, 4 * sizeof(int));
+    cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    double *weight;
+    cudaMalloc(&weight,dims_input_cuda[3]*sizeof(double));
+    cudaMemcpy(weight,weight_raw,dims_input_cuda[3]*sizeof(double),cudaMemcpyHostToDevice);
+
+    double *bias;
+    cudaMalloc(&bias,dims_input_cuda[3]*sizeof(double));
+    cudaMemcpy(bias,bias_raw,dims_input_cuda[3]*sizeof(double),cudaMemcpyHostToDevice);
+
+    long long int* Squaretensor;
+    cudaMalloc(&Squaretensor,(size)*sizeof(long long int));
+
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks((dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    ILayerNormforward_<double><<<numBlocks,threadsPerBlock>>>(input_cuda_tensor,SF,dims,quantized_tensor,Squaretensor,weight,bias,new_SF);
+    cudaDeviceSynchronize();
+
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
+    }
+
+    cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(double), cudaMemcpyDeviceToHost);
+
+    cudaFree(input_cuda_tensor);
+    cudaFree(weight);
+    cudaFree(bias);
+    cudaFree(dims);
+    cudaFree(quantized_tensor);
+    cudaFree(Squaretensor);
+}
+
+template <class T>
+__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size) {
+        T d_norm = output_grad[i] * weight[i];
+        T d_var = d_norm * (input_tensor[i] - mean[i]) * -0.5 * powf(var[i] + 1e-6, -1.5);
+        T d_mean = d_norm * -1 / sqrtf(var[i] + 1e-6) + d_var * -2 * mean[i] / size;
+        T d_input = d_norm / sqrtf(var[i] + 1e-6) + d_var * 2 * (input_tensor[i] - mean[i]) / size + d_mean / size;
+
+        input_grad[i] = d_input;
+        weight_grad[i] = output_grad[i] * output_tensor[i];
+        bias_grad[i] = output_grad[i]; 
+    }
+}
+
+template <>
+void ILayerNormbackward<float>(const float* input_tensor, const float* output_grad, const float* output_tensor,const float* mean,const float* var, const float* weight, const float* bias, float* input_grad, float* weight_grad, float* bias_grad, size_t size)
+{
+    float* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(float));
+    cudaMemcpy(input_cuda_tensor,input_tensor,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(float));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* output_tensor_;
+    cudaMalloc(&output_tensor_,size*sizeof(float));
+    cudaMemcpy(output_tensor_,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* mean_;
+    cudaMalloc(&mean_,size*sizeof(float));
+    cudaMemcpy(mean_,mean,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* var_;
+    cudaMalloc(&var_,size*sizeof(float));
+    cudaMemcpy(var_,var,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* weight_;
+    cudaMalloc(&weight_,size*sizeof(float));
+    cudaMemcpy(weight_,weight,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* bias_;
+    cudaMalloc(&bias_,size*sizeof(float));
+    cudaMemcpy(bias_,bias,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    
+    float* input_grad_;
+    cudaMalloc(&input_grad_,size*sizeof(float));
+
+    float* weight_grad_;
+    cudaMalloc(&weight_grad_,size*sizeof(float));
+
+    float* bias_grad_;
+    cudaMalloc(&bias_grad_,size*sizeof(float));
+
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ILayerNormbackward_<<<Blocks,threadParBlock>>>(output_grad_,input_cuda_tensor,output_tensor_,mean_,var_,weight_,bias_,input_grad_, weight_grad_, bias_grad_, size);
+
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(input_grad , input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(weight_grad , weight_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(bias_grad , bias_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost);
+
+    cudaFree(input_cuda_tensor);
+    cudaFree(output_grad_);
+    cudaFree(mean_);
+    cudaFree(var_);
+    cudaFree(weight_);
+    cudaFree(bias_);
+    cudaFree(input_grad_);
+    cudaFree(weight_grad_);
+    cudaFree(bias_grad_);
+    
+}
+
+template <>
+void ILayerNormbackward<double>(const double* input_tensor, const double* output_grad, const double* output_tensor,const double* mean,const double* var, const double* weight, const double* bias, double* input_grad, double* weight_grad, double* bias_grad, size_t size)
+{
+    double* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(double));
+    cudaMemcpy(input_cuda_tensor,input_tensor,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(double));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* output_tensor_;
+    cudaMalloc(&output_tensor_,size*sizeof(double));
+    cudaMemcpy(output_tensor_,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* mean_;
+    cudaMalloc(&mean_,size*sizeof(double));
+    cudaMemcpy(mean_,mean,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* var_;
+    cudaMalloc(&var_,size*sizeof(double));
+    cudaMemcpy(var_,var,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* weight_;
+    cudaMalloc(&weight_,size*sizeof(double));
+    cudaMemcpy(weight_,weight,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* bias_;
+    cudaMalloc(&bias_,size*sizeof(double));
+    cudaMemcpy(bias_,bias,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    
+    double* input_grad_;
+    cudaMalloc(&input_grad_,size*sizeof(double));
+
+    double* weight_grad_;
+    cudaMalloc(&weight_grad_,size*sizeof(double));
+
+    double* bias_grad_;
+    cudaMalloc(&bias_grad_,size*sizeof(double));
+
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ILayerNormbackward_<<<Blocks,threadParBlock>>>(output_grad_,input_cuda_tensor,output_tensor_,mean_,var_,weight_,bias_,input_grad_, weight_grad_, bias_grad_, size);
+
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+
+    cudaMemcpy(input_grad , input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(weight_grad , weight_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(bias_grad , bias_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost);
+
+    cudaFree(input_cuda_tensor);
+    cudaFree(output_grad_);
+    cudaFree(mean_);
+    cudaFree(var_);
+    cudaFree(weight_);
+    cudaFree(bias_);
+    cudaFree(input_grad_);
+    cudaFree(weight_grad_);
+    cudaFree(bias_grad_);
+}
+
+}
\ No newline at end of file
diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed09ed45f5006c3760376a9d6f44f29d05bcfabe
--- /dev/null
+++ b/src/operator/LnImpl.cpp
@@ -0,0 +1,80 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/LnImpl.hpp"
+#include "aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Ln.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::LnImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::LnImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+
+    Aidge::lnForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size()));
+}
+
+void Aidge::LnImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    switch(op.getInput(0)->grad()->dataType()) {
+        case DataType::Float64:
+            backward_<double>(output_grad);
+            break;
+        case DataType::Float32:
+            backward_<float>(output_grad);
+            break;
+        case DataType::Float16:
+            backward_<half>(output_grad);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::LnImpl_cuda::backward_(const Tensor& output_grad) {
+    //TODO
+}
diff --git a/src/operator/LnImpl_CUDA_kernels.cu b/src/operator/LnImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..21521eaa9dbe7f7bb0664a4fce71c67979f735ad
--- /dev/null
+++ b/src/operator/LnImpl_CUDA_kernels.cu
@@ -0,0 +1,48 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp"
+// Base template for floating-point types (float, double)
+template<typename T>
+__device__ T ln_helper(T x) {
+    return std::log(x);  // std::log works for both float and double
+}
+
+// Specialization for half-precision type using CUDA's half
+template<>
+__device__ half ln_helper<half>(half x) {
+    float x_float = __half2float(x);  // Convert __half to float
+    return __float2half(std::log(x_float));  // Compute log and convert back to half
+}
+
+template <class T>
+__global__ void lnKernel(const T* input, T* output, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
+    output[idx] = ln_helper(input[idx]);
+}
+
+template <class T>
+void Aidge::lnForward(const T* input, T* output, int size)
+{
+    int blockSize = 256;
+    int numBlocks = (size + blockSize - 1) / blockSize;
+
+    // Launch the kernel
+    lnKernel<<<numBlocks, blockSize>>>(input, output, size);
+};
+
+template void Aidge::lnForward<double>(const double* input, double* output, int size);
+
+template void Aidge::lnForward<float>(const float* input, float* output, int size);
+
+template void Aidge::lnForward<half>(const half* input, half* output, int size);
\ No newline at end of file
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..af87251e8f29eded7d24cca2f08b880557ebb482
--- /dev/null
+++ b/src/operator/MulImpl.cpp
@@ -0,0 +1,221 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+#include <chrono>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/MulImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Mul.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::MulImpl_cuda::forward() {
+    const Mul_Op& op = static_cast<const Mul_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Mul operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Mul forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Mul operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Mul forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Mul inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
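+        // The cuDNN Nd descriptors below are always built as at least 4-D tensors, hence the
+        // padding above; e.g. a {3} input against a 4-D output becomes {1, 1, 1, 3} with
+        // contiguous strides {3, 3, 3, 1}.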
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    // Create a Tensor descriptor with the broadcasted dims and strides
+    cudnnTensorDescriptor_t tensorDesc0, tensorDesc1;
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data()));
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data()));
+    // Multiply inputs
+    cudnnOpTensorDescriptor_t opTensorDesc;
+    CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc));
+    CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN));
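+    // cudnnOpTensor only broadcasts its second (B) operand, so the input with more elements
+    // is always passed as the first (A) operand and the smaller one as B.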
+    if(inputs[0].size()>inputs[1].size()) {
+        CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(),
+                                        opTensorDesc,
+                                        &alpha,
+                                        tensorDesc0,
+                                        inputs[0].getImpl()->rawPtr(),
+                                        &alpha,
+                                        tensorDesc1,
+                                        inputs[1].getImpl()->rawPtr(),
+                                        &beta,
+                                        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                                        std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+    }
+    else {
+        CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(),
+                                opTensorDesc,
+                                &alpha,
+                                tensorDesc1,
+                                inputs[1].getImpl()->rawPtr(),
+                                &alpha,
+                                tensorDesc0,
+                                inputs[0].getImpl()->rawPtr(),
+                                &beta,
+                                std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                                std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+    }
+
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0));
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1));
+    CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc));
+}
+
+void Aidge::MulImpl_cuda::backward() {
+    const Mul_Op& op = static_cast<const Mul_Op&>(mOp);
+    // Check output
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Mul operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Mul backward because the output gradient has no implementation.");
+
+    std::shared_ptr<Tensor> outputGradFallback;
+    const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad());
+
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        std::shared_ptr<Tensor> inputFallback;
+        const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+        
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outputGrad, dims, strides);
+            break;
+        case DataType::Float32:
+            backward_<float>(outputGrad, dims, strides);
+            break;
+        case DataType::Float16:
+            backward_<half>(outputGrad, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+
+    // Create a Tensor descriptor with the broadcasted dims and strides
+    cudnnTensorDescriptor_t tensorDesc0, tensorDesc1;
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data()));
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data()));
+    
+    // Create the operation descriptor
+    cudnnOpTensorDescriptor_t opTensorDesc;
+    CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc));
+    CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN));
+
+    // Input0_grad = output_grad * Input1
+    CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(),
+                            opTensorDesc,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                            outputGrad.getImpl()->rawPtr(),
+                            &alpha,
+                            tensorDesc1,
+                            op.getInput(1)->getImpl()->rawPtr(),
+                            &beta,
+                            tensorDesc0,
+                            op.getInput(0)->grad()->getImpl()->rawPtr()));
+
+    // Input1_grad = output_grad * Input0
+    CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(),
+                            opTensorDesc,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                            outputGrad.getImpl()->rawPtr(),
+                            &alpha,
+                            tensorDesc0,
+                            op.getInput(0)->getImpl()->rawPtr(),
+                            &beta,
+                            tensorDesc1,
+                            op.getInput(1)->grad()->getImpl()->rawPtr()));
+    
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0));
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1));
+    CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc));
+}
\ No newline at end of file
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84af8c2a74c8ebaeb7d7380975089086e4db31da
--- /dev/null
+++ b/src/operator/PowImpl.cpp
@@ -0,0 +1,113 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/PowImpl.hpp"
+#include "aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Pow.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::PowImpl_cuda::forward() {
+    const Pow_Op& op = static_cast<const Pow_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Pow operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Pow forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Pow inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::PowImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
+    const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+    std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
+    if(op.getOutput(0)->nbDims()>1) {
+        for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
+            outputStrides[i] = outputStrides[i+1] *  op.getOutput(0)->dims()[i+1];
+        }
+    }
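+    // Contiguous (row-major) strides of the output: the innermost dimension has stride 1.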
+    std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1);
+    for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) {
+        outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]);
+    }
+
+    Aidge::powForward<T>(input1Ptr, outputPtr, input2Ptr,
+                inputsDims[0], inputsDims[1], outDims,
+                inputsStrides[0], inputsStrides[1], outputStrides,
+                static_cast<int>(op.getOutput(0)->size()));
+}
+
+void Aidge::PowImpl_cuda::backward() {
+    // TODO
+}
+
+template <class T>
+void Aidge::PowImpl_cuda::backward_(const Tensor& outGrad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    // TODO
+}
\ No newline at end of file
diff --git a/src/operator/PowImpl_CUDA_kernels.cu b/src/operator/PowImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..acd03b21764dccdfd3c5bc279e255cd0b692537e
--- /dev/null
+++ b/src/operator/PowImpl_CUDA_kernels.cu
@@ -0,0 +1,99 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cuda_fp16.h>
+
+#include "aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp"
+
+// Helper function for pow
+template <typename T>
+__device__ T pow(T x, T exponent) {
+    return std::pow(x, exponent);
+}
+template <>
+__device__ half pow<half>(half x, half exponent) {
+    return __float2half(powf(__half2float(x), __half2float(exponent)));
+}
+
+template <class T>
+__global__ void pow_kernel(const T* input, T* output, const T* exponent,
+                          int* input_shape, int* exponent_shape, int* output_shape,
+                          int* input_strides, int* exponent_strides, int* output_strides,
+                          int num_dims, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
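+    // Unflatten the output index using the output strides, then rebuild the input and
+    // exponent offsets; a broadcast (size-1) dimension always maps to coordinate 0.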
+    int input_idx = 0, exponent_idx = 0;
+    int temp_idx = idx;
+    for (int i = 0; i < num_dims; ++i) {
+        int dim = temp_idx / output_strides[i];
+        temp_idx %= output_strides[i];
+        input_idx += (input_shape[i] == 1 ? 0 : dim) * input_strides[i];
+        exponent_idx += (exponent_shape[i] == 1 ? 0 : dim) * exponent_strides[i];
+    }
+
+    output[idx] = pow(input[input_idx], exponent[exponent_idx]);
+}
+
+
+template <class T>
+void Aidge::powForward(const T* input, T* output, const T* exponent,
+                                const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
+                                const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
+                                int outSize)
+{
+    int *d_input_strides, *d_exponent_strides, *d_output_strides, *d_input_shape, *d_exponent_shape, *d_output_shape;
+    // Allocate device memory
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input_shape, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_exponent_shape, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_output_shape, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_input_strides, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_exponent_strides, inputDims.size() * sizeof(int)));
+    CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, inputDims.size() * sizeof(int)));
+
+    // Copy data from host to device;
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input_shape, inputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_exponent_shape, exponentDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_output_shape, outputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_input_strides, inputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_exponent_strides, exponentStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice));
+    int blockSize = 256;
+    int numBlocks = (outSize + blockSize - 1) / blockSize;
+
+    int num_dims = inputDims.size();
+    // Launch the kernel
+    pow_kernel<<<numBlocks, blockSize>>>(input, output, exponent,
+                                        d_input_shape, d_exponent_shape, d_output_shape,
+                                        d_input_strides, d_exponent_strides, d_output_strides,
+                                        num_dims, outSize);
+    CHECK_CUDA_STATUS(cudaFree(d_input_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_exponent_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_output_shape));
+    CHECK_CUDA_STATUS(cudaFree(d_input_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_exponent_strides));
+    CHECK_CUDA_STATUS(cudaFree(d_output_strides));
+};
+
+template void Aidge::powForward<double>(const double* input, double* output, const double* exponent,
+                                        const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
+                                        const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
+                                        int outSize);
+
+template void Aidge::powForward<float>(const float* input, float* output, const float* exponent,
+                                        const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
+                                        const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
+                                        int outSize);
+
+template void Aidge::powForward<half>(const half* input, half* output, const half* exponent,
+                                        const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
+                                        const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
+                                        int outSize);
diff --git a/src/operator/ReduceImpl_CUDA_kernels.cu b/src/operator/ReduceImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7002e34116d2c1050987dc0cb93dbf7339a7ea93
--- /dev/null
+++ b/src/operator/ReduceImpl_CUDA_kernels.cu
@@ -0,0 +1,114 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp"
+
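+// Backward of a reduction: every element of the full-shape output copies the value of its
+// reduced counterpart, i.e. the gradient is duplicated `factors[i]` times along each axis in `axes`.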
+template <typename T>
+__global__ void duplicateElements(const T* input, T* output, const std::size_t* shape, const std::size_t* new_shape, const int* axes, const std::size_t* factors, int num_dims, int num_axes) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int input_size = 1;
+    int output_size = 1;
+
+    for (int i = 0; i < num_dims; ++i) {
+        input_size *= shape[i];
+        output_size *= new_shape[i];
+    }
+
+    if (idx >= output_size) return;
+
+    int* out_idx = new int[num_dims];
+    int* in_idx = new int[num_dims];
+    int remaining_idx = idx;
+
+    for (int i = num_dims - 1; i >= 0; --i) {
+        out_idx[i] = remaining_idx % new_shape[i];
+        remaining_idx /= new_shape[i];
+    }
+
+    for (int i = 0; i < num_dims; ++i) {
+        in_idx[i] = out_idx[i];
+    }
+
+    for (int i = 0; i < num_axes; ++i) {
+        int axis = axes[i];
+        int factor = factors[i];
+        in_idx[axis] = out_idx[axis] / factor;
+    }
+
+    int in_linear_idx = 0;
+    int out_linear_idx = 0;
+    int input_stride = 1;
+    int output_stride = 1;
+
+    for (int i = num_dims - 1; i >= 0; --i) {
+        in_linear_idx += in_idx[i] * input_stride;
+        out_linear_idx += out_idx[i] * output_stride;
+        input_stride *= shape[i];
+        output_stride *= new_shape[i];
+    }
+
+    output[out_linear_idx] = input[in_linear_idx];
+
+    delete[] out_idx;
+    delete[] in_idx;
+}
+
+template <typename T>
+void Aidge::ReduceBackward(const T* input, T* output, const std::vector<std::size_t>& inputDims, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, int outSize) {
+
+    std::size_t* d_shape;
+    std::size_t* d_new_shape;
+    int* d_axes;
+    std::size_t* d_factors;
+    cudaMalloc(&d_shape, inputDims.size() * sizeof(std::size_t));
+    cudaMalloc(&d_new_shape, outputDims.size() * sizeof(std::size_t));
+    cudaMalloc(&d_axes, axes.size() * sizeof(int));
+    cudaMalloc(&d_factors, axes.size() * sizeof(std::size_t));
+
+    cudaMemcpy(d_shape, inputDims.data(), inputDims.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_new_shape, outputDims.data(), outputDims.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_axes, axes.data(), axes.size() * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_factors, factors.data(), axes.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
+
+    int blockSize = 256;
+    int numBlocks = (outSize + blockSize - 1) / blockSize;
+
+    duplicateElements<<<numBlocks, blockSize>>>(input, output, d_shape, d_new_shape, d_axes, d_factors, static_cast<int>(inputDims.size()), static_cast<int>(axes.size()));
+    cudaFree(d_shape);
+    cudaFree(d_new_shape);
+    cudaFree(d_axes);
+    cudaFree(d_factors);
+}
+
+
+template void Aidge::ReduceBackward(const double* input,
+                               double* output,
+                               const std::vector<std::size_t>& inputDims,
+                               const std::vector<std::size_t>& outputDims,
+                               const std::vector<int>& axes,
+                               const std::vector<std::size_t>& factors,
+                               int outSize);
+
+template void Aidge::ReduceBackward(const float* input,
+                               float* output,
+                               const std::vector<std::size_t>& inputDims,
+                               const std::vector<std::size_t>& outputDims,
+                               const std::vector<int>& axes,
+                               const std::vector<std::size_t>& factors,
+                               int outSize);
+
+template void Aidge::ReduceBackward(const half* input,
+                               half* output,
+                               const std::vector<std::size_t>& inputDims,
+                               const std::vector<std::size_t>& outputDims,
+                               const std::vector<int>& axes,
+                               const std::vector<std::size_t>& factors,
+                               int outSize);
diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff83ea5153a95e109ce7ef83c42ed4d672561ad1
--- /dev/null
+++ b/src/operator/ReduceMeanImpl.cpp
@@ -0,0 +1,200 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ReduceMean.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ReduceMeanImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(op.getInput(0), "missing input in ReduceMean operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceMean forward because the input has no implementation.");
+
+    const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+
+    const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp);
+    bool keepDims = rmOp.keepDims();
+    auto axes =  rmOp.axes();
+    if (axes.empty()) {
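+        // Nothing to reduce over: forward is a plain copy of the input.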
+        input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size());
+    }
+    else {
+        switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+            case DataType::Float64:
+                forward_<double>(input, axes, keepDims);
+                break;
+            case DataType::Float32:
+                forward_<float>(input, axes, keepDims);
+                break;
+            case DataType::Float16:
+                forward_<half>(input, axes, keepDims);
+                break;
+            default:
+                AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+        }
+    }
+}
+
+
+template <class T>
+void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes,  bool keepDims) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    cudnnReduceTensorDescriptor_t reduceDesc;
+    cudnnTensorDescriptor_t outputDesc;
+    if (keepDims) {
+        outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0));
+        CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                            CUDNN_REDUCE_TENSOR_AVG,
+                                                            CudaContext::data_type<T>::value,
+                                                            CUDNN_PROPAGATE_NAN,
+                                                            CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                            CUDNN_32BIT_INDICES));
+
+
+        size_t workspaceSize;
+        CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            outputDesc,
+                            &workspaceSize));
+
+        void *d_workspace;
+        CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+        CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            NULL,
+                            0,
+                            d_workspace,
+                            workspaceSize,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            input.getImpl()->rawPtr(),
+                            &beta,
+                            outputDesc,
+                            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+
+        CHECK_CUDA_STATUS(cudaFree(d_workspace));
+        CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+    }
+    else {
+        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc));
+        std::vector<int> outputDims;
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims));
+        for (const auto axis:axes) {
+            outputDims[axis] = 1;
+        }
+        if (outputDims.size() < 4) {
+            outputDims.resize(4, 1);
+        }
+        // Compute the corresponding strides
+        std::vector<int> outputStrides(outputDims.size());
+        int product = 1;
+        for (size_t i = outputDims.size(); i > 0; --i) {
+            outputStrides[i - 1] = product;
+            product *= outputDims[i - 1];
+        }
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data()));
+    
+        CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                            CUDNN_REDUCE_TENSOR_AVG,
+                                                            CudaContext::data_type<T>::value,
+                                                            CUDNN_PROPAGATE_NAN,
+                                                            CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                            CUDNN_32BIT_INDICES));
+
+
+        size_t workspaceSize;
+        CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            outputDesc,
+                            &workspaceSize));
+
+        void *d_workspace;
+        CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+        CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            NULL,
+                            0,
+                            d_workspace,
+                            workspaceSize,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            input.getImpl()->rawPtr(),
+                            &beta,
+                            outputDesc,
+                            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+
+        CHECK_CUDA_STATUS(cudaFree(d_workspace));
+        CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc));
+    }
+}
+
+void Aidge::ReduceMeanImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceMean operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceMean backward because the output grad has no implementation.");
+
+    const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad());
+
+    const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp);
+    auto axes =  rmOp.axes();
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outGrad, axes);
+            break;
+        case DataType::Float32:
+            backward_<float>(outGrad, axes);
+            break;
+        case DataType::Float16:
+            backward_<half>(outGrad, axes);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr());
+    T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
+
+    std::vector<std::size_t> factors;
+    for (auto axis:axes) {
+        factors.push_back(op.getInput(0)->grad()->dims()[axis]);
+    }
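+    // factors collects the size of each reduced axis; it is forwarded to the shared ReduceBackward
+    // CUDA kernel together with the output-gradient and input-gradient shapes.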
+    
+    Aidge::ReduceBackward(outputGrad,
+                            inputGrad,
+                            outGrad.dims(),
+                            op.getInput(0)->grad()->dims(),
+                            axes,
+                            factors,
+                            static_cast<int>(op.getInput(0)->grad()->size()));
+}
diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..895584d87dab88f3f71a424a02a3b32954c4dc43
--- /dev/null
+++ b/src/operator/ReduceSumImpl.cpp
@@ -0,0 +1,199 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ReduceSumImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(op.getInput(0), "missing input in ReduceSum operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceSum forward because the input has no implementation.");
+
+    const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+
+    const ReduceSum_Op& rsOp = static_cast<const ReduceSum_Op&>(mOp);
+    bool keepDims = rsOp.keepDims();
+    auto axes =  rsOp.axes();
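+    // An empty axes attribute means nothing is reduced: forward is then a plain copy of the input.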
+    if (axes.empty()) {
+        input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size());
+    }
+    else {
+        switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+            case DataType::Float64:
+                forward_<double>(input, axes, keepDims);
+                break;
+            case DataType::Float32:
+                forward_<float>(input, axes, keepDims);
+                break;
+            case DataType::Float16:
+                forward_<half>(input, axes, keepDims);
+                break;
+            default:
+                AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+        }
+    }
+}
+
+
+template <class T>
+void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    cudnnReduceTensorDescriptor_t reduceDesc;
+    cudnnTensorDescriptor_t outputDesc;
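+    // With keepDims the output keeps the input rank, so its own cuDNN descriptor is reused as-is;
+    // otherwise a temporary Nd descriptor with the reduced axes set to 1 is built, since
+    // cudnnReduceTensor expects input and output descriptors of the same rank.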
+    if (keepDims) {
+        outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0));
+        CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                            CUDNN_REDUCE_TENSOR_ADD,
+                                                            CudaContext::data_type<T>::value,
+                                                            CUDNN_PROPAGATE_NAN,
+                                                            CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                            CUDNN_32BIT_INDICES));
+
+
+        size_t workspaceSize;
+        CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            outputDesc,
+                            &workspaceSize));
+
+        void *d_workspace;
+        CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+        CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            NULL,
+                            0,
+                            d_workspace,
+                            workspaceSize,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            input.getImpl()->rawPtr(),
+                            &beta,
+                            outputDesc,
+                            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+
+        CHECK_CUDA_STATUS(cudaFree(d_workspace));
+        CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+    }
+    else {
+        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc));
+        std::vector<int> outputDims;
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims));
+        for (const auto axis:axes) {
+            outputDims[axis] = 1;
+        }
+        if (outputDims.size() < 4) {
+            outputDims.resize(4, 1);
+        }
+        // Compute the corresponding strides
+        std::vector<int> outputStrides(outputDims.size());
+        int product = 1;
+        for (size_t i = outputDims.size(); i > 0; --i) {
+            outputStrides[i - 1] = product;
+            product *= outputDims[i - 1];
+        }
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data()));
+    
+        CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                            CUDNN_REDUCE_TENSOR_ADD,
+                                                            CudaContext::data_type<T>::value,
+                                                            CUDNN_PROPAGATE_NAN,
+                                                            CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                            CUDNN_32BIT_INDICES));
+
+
+        size_t workspaceSize;
+        CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            outputDesc,
+                            &workspaceSize));
+
+        void *d_workspace;
+        CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+        CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                            reduceDesc,
+                            NULL,
+                            0,
+                            d_workspace,
+                            workspaceSize,
+                            &alpha,
+                            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                            input.getImpl()->rawPtr(),
+                            &beta,
+                            outputDesc,
+                            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+
+        CHECK_CUDA_STATUS(cudaFree(d_workspace));
+        CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+        CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc));
+    }
+}
+
+void Aidge::ReduceSumImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceSum operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceSum backward because the output grad has no implementation.");
+
+    const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad());
+
+    const ReduceSum_Op& rsOp = static_cast<const ReduceSum_Op&>(mOp);
+    auto axes = rsOp.axes();
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outGrad, axes);
+            break;
+        case DataType::Float32:
+            backward_<float>(outGrad, axes);
+            break;
+        case DataType::Float16:
+            backward_<half>(outGrad, axes);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr());
+    T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
+
+    std::vector<std::size_t> factors;
+    for (auto axis:axes) {
+        factors.push_back(op.getInput(0)->grad()->dims()[axis]);
+    }
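+    // factors collects the size of each reduced axis; it is forwarded to the shared ReduceBackward
+    // CUDA kernel together with the output-gradient and input-gradient shapes.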
+    
+    Aidge::ReduceBackward(outputGrad,
+                        inputGrad,
+                        outGrad.dims(),
+                        op.getInput(0)->grad()->dims(),
+                        axes,
+                        factors,
+                        static_cast<int>(op.getInput(0)->grad()->size()));
+}
diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp
index 8016a5a9d1dfc26454af2cb03b6fe573820245f5..783e244057b0fc42a782fd363c3a99aa6d73b46b 100644
--- a/src/operator/ReshapeImpl.cpp
+++ b/src/operator/ReshapeImpl.cpp
@@ -39,8 +39,3 @@ void Aidge::ReshapeImpl_cuda::backward() {
 
     std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size());
 }
-
-Aidge::ReshapeImpl_cuda::~ReshapeImpl_cuda() {
-
-}
-
diff --git a/src/operator/ShiftGELUImpl.cpp b/src/operator/ShiftGELUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2774804d04a422aefd0c66ed0d1fc1d949b1f06
--- /dev/null
+++ b/src/operator/ShiftGELUImpl.cpp
@@ -0,0 +1,119 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#include <algorithm>  // std::max
+#include <cassert>
+#include <cmath>      // std::pow, std::abs
+#include <limits>     // std::numeric_limits
+#include <numeric>    // std::accumulate
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ShiftGELU.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ShiftGELUImpl_cuda::forward() {
+
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    assert(mOp.getRawInput(0) && "missing input #0");
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template<class T>
+void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input)
+{
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr());
+    T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    int N = 15;
+    int output_bits = 8;
+    size_t size = input.size();
+    std::vector<DimSize_t> dims_input = input.dims();
+
+    // TODO: find a more efficient way to compute the scaling factor (a dedicated min/max
+    // reduction would avoid this element-wise host-side loop).
+    // NOTE: input_raw is read on the host here, consistent with the host-to-device copy
+    // performed in ShiftGELUforward.
+
+    double min = std::numeric_limits<double>::max();
+    double max = std::numeric_limits<double>::lowest();
+    for(std::size_t i = 0; i < dims_input[0]; i++) {
+        for(std::size_t j = 0; j < dims_input[1]; j++) {
+            for(std::size_t k = 0; k < dims_input[2]; k++) {
+                for(std::size_t l = 0; l < dims_input[3]; l++) {
+                    const std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    const double value = static_cast<double>(input_raw[input.getIdx(coordIdx)]);
+                    if (value < min) {
+                        min = value;
+                    }
+                    if (value > max) {
+                        max = value;
+                    }
+                }
+            }
+        }
+    }
+
+    // Symmetric quantization: map the largest absolute value onto the signed output_bits range.
+    double m = std::max(std::abs(min), std::abs(max));
+    double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1;
+    double scaling_factor = m / normalization_factor;
+
+    // The new scaling factor that we can use to dequantify the returned tensor (not used here)
+    // double new_SF = 1/std::pow(2,2*output_bits-1);
+
+    ShiftGELUforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input);
+}
+
+void Aidge::ShiftGELUImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output gradient #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    // The forward output values are what the backward kernel evaluates the GELU derivative at.
+    const T * output_tensor = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr());
+
+    size_t size = output_grad.size();
+
+    T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
+
+    const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr());
+    ShiftGELUbackward(output_tensor, output_grad_raw, input_grad, size);
+
+}
\ No newline at end of file
diff --git a/src/operator/ShiftGELUImpl_CUDA_kernels.cu b/src/operator/ShiftGELUImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aabd89c04e960f9f19eca69247173168d3eaf71e
--- /dev/null
+++ b/src/operator/ShiftGELUImpl_CUDA_kernels.cu
@@ -0,0 +1,256 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y))
+#define CLAMP(X) (((X) < (0)) ? (0) : (X))
+
+#include <algorithm>  // std::min
+#include <cmath>      // std::pow
+#include <stdio.h>
+
+#include <cuda_runtime.h>
+
+#include "aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp"
+
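+// Shift-only integer approximation of exp(I * SF): I is first scaled by ~1/ln(2) using shifts
+// (I + I/2 - I/16), then split into quotient q and remainder r of I0 = floor(-1/SF); the result
+// is the linear term (r/2 - I0) scaled by 2^(N-q) and clamped to be non-negative. This follows
+// the shift-based exponential used in integer-only transformer quantization (I-BERT / I-ViT style).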
+__device__ inline int ExpShift(int I,int N, double SF)
+{
+    int Ip = I + (I >> 1) - (I >> 4);
+    int I0 = floorf(-1.0/SF);
+    Ip = MAX(Ip,N*I0);
+    int q = floorf(Ip / (I0));
+    int r = Ip -(I0*q);
+    int Ib = r/2 - I0;
+    Ib = CLAMP(Ib * powf(2,N-q));
+    return (int)Ib;
+}
+
+namespace Aidge{
+
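+// One thread per row of the innermost dimension: the row is quantized with SF, the shift-based
+// exponential of (x - rowMax) approximates sigmoid(1.702 * x) in integer arithmetic (SF_sig folds
+// in the 1.702 factor), and the product x * sigmoid(1.702 * x) is written back dequantized with
+// Final_SF.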
+template <class T>
+__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits) {
+
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+
+    double SF_sig = SF * 1.702;
+    double Final_SF = SF / powf(2,(output_bits-1));
+
+    if (x < dims[0] && y < dims[1] && z < dims[2]) {
+        int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3];
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            quantized_tensor[idx] = roundf(input[idx] / SF);
+        }
+        int maxVal = quantized_tensor[maxIdx];
+        for (int i = 1; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            maxVal = MAX(maxVal, quantized_tensor[idx]);
+        }
+        int Max_Exp = ExpShift(-maxVal,N,SF_sig);
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            GELUtensor[idx] = ExpShift(quantized_tensor[idx] - maxVal,N,SF_sig);
+            if(GELUtensor[idx] > INT_MAX - Max_Exp) {
+                SumTensor[idx] = 1;
+            }
+            else
+            {
+                SumTensor[idx] = floorf(INT_MAX/(GELUtensor[idx] + Max_Exp));
+            }
+            SumTensor[idx] = floorf((GELUtensor[idx] * SumTensor[idx]) >> (31 - output_bits + 1));
+            quantized_tensor[idx] *= SumTensor[idx];
+            input[idx] = quantized_tensor[idx] * Final_SF;
+        }
+    }
+}
+
+template <>
+void ShiftGELUforward<float>(const float* input, float* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) {
+
+    double new_SF = 1/std::pow(2,2*output_bits-1);
+
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    float* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(float));
+    cudaMemcpy(input_cuda_tensor,input,size*sizeof(float),cudaMemcpyHostToDevice);
+                                                                                                                        
+    int* quantized_tensor;
+    cudaMalloc(&quantized_tensor,size*sizeof(int));
+    
+    int* GELUtensor;
+    cudaMalloc(&GELUtensor,size*sizeof(int));
+
+    int* SumTensor;
+    cudaMalloc(&SumTensor,size*sizeof(int));
+
+    int* dims;
+    cudaMalloc(&dims,4*sizeof(int));
+
+    cudaMemcpy(dims,dims_input_cuda,4*sizeof(int),cudaMemcpyHostToDevice);     
+
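+    // 10x10x10 = 1000 threads per block (below the 1024 threads-per-block limit); the grid covers
+    // the three outer dimensions and each thread loops over the innermost one.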
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks((dims_input[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (dims_input[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (dims_input[2] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    ShiftGELUforward_<float><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, GELUtensor, SumTensor, dims, SF, N, output_bits);
+    cudaDeviceSynchronize();
+
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(output,input_cuda_tensor,size*sizeof(float),cudaMemcpyDeviceToHost);
+
+    cudaFree(quantized_tensor);
+    cudaFree(GELUtensor);
+    cudaFree(SumTensor);
+    cudaFree(dims);
+    cudaFree(input_cuda_tensor);
+}
+
+template <>
+void ShiftGELUforward<double>(const double* input, double* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) {
+
+    double new_SF = 1/std::pow(2,2*output_bits-1);
+
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    double* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor,size*sizeof(double));
+    cudaMemcpy(input_cuda_tensor,input,size*sizeof(double),cudaMemcpyHostToDevice);
+    
+    int* quantized_tensor;
+    cudaMalloc(&quantized_tensor,size*sizeof(int));
+
+    int* GELUtensor;
+    cudaMalloc(&GELUtensor,size*sizeof(int));
+
+    int* SumTensor;
+    cudaMalloc(&SumTensor,size*sizeof(int));
+
+    int* dims;
+    cudaMalloc(&dims,4*sizeof(int));
+
+    cudaMemcpy(dims,dims_input_cuda,4*sizeof(int),cudaMemcpyHostToDevice);     
+
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks((dims_input[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (dims_input[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (dims_input[2] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    ShiftGELUforward_<double><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, GELUtensor, SumTensor, dims, SF, N, output_bits);
+    cudaDeviceSynchronize();
+
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(output,input_cuda_tensor,size*sizeof(double),cudaMemcpyDeviceToHost);
+
+    cudaFree(quantized_tensor);
+    cudaFree(GELUtensor);
+    cudaFree(SumTensor);
+    cudaFree(dims);
+    cudaFree(input_cuda_tensor);
+}
+
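+// Backward applies the plain floating-point GELU derivative (tanh approximation) at the values
+// passed in output_tensor, rather than differentiating the integer-only forward approximation.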
+template <class T>
+__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size) {
+
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size) {
+        float x = output_tensor[index];
+        float grad = output_grad[index];
+
+        float cdf = 0.5 * (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * pow(x, 3))));
+        float pdf = exp(-0.5 * x * x) / sqrt(2.0 * M_PI);
+        // d/dx [x * CDF(x)] = CDF(x) + x * PDF(x)
+        float dx = cdf + x * pdf;
+        float backprop_grad = grad * dx;
+        input_grad[index] = backprop_grad;
+    }
+}
+
+template <>
+void ShiftGELUbackward<float>(const float* output_tensor, const float* output_grad, float* input_grad, size_t size)
+{
+    float* output_cuda_tensor;
+    cudaMalloc(&output_cuda_tensor,size*sizeof(float));
+    cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(float));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float *input_grad_;
+    cudaMalloc(&input_grad_, size * sizeof(float));
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ShiftGELUbackward_<float><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,size);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+    cudaMemcpy(input_grad,input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaFree(output_cuda_tensor);
+    cudaFree(input_grad_);
+    cudaFree(output_grad_);
+}
+
+template <>
+void ShiftGELUbackward<double>(const double* output_tensor, const double* output_grad, double* input_grad, size_t size)
+{
+    double* output_cuda_tensor;
+    cudaMalloc(&output_cuda_tensor,size*sizeof(double));
+    cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(double));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double *input_grad_;
+    cudaMalloc(&input_grad_, size * sizeof(double));
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ShiftGELUbackward_<double><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,size);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+    cudaMemcpy(input_grad,input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaFree(output_cuda_tensor);
+    cudaFree(input_grad_);
+    cudaFree(output_grad_);
+}
+
+}
\ No newline at end of file
diff --git a/src/operator/ShiftMaxImpl.cpp b/src/operator/ShiftMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1134cc5d6b99e53eb492c82e32d811bc0bcba0e0
--- /dev/null
+++ b/src/operator/ShiftMaxImpl.cpp
@@ -0,0 +1,121 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#include <algorithm>  // std::max
+#include <cassert>
+#include <cmath>      // std::pow, std::abs
+#include <limits>     // std::numeric_limits
+#include <numeric>    // std::accumulate
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ShiftMax.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ShiftMaxImpl_cuda::forward() {
+
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    assert(mOp.getRawInput(0) && "missing input #0");
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template<class T>
+void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input)
+{
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr());
+    T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    int N = 15;
+    int output_bits = 8;
+    size_t size = input.size();
+    std::vector<DimSize_t> dims_input = input.dims();
+
+    // TODO: find a more efficient way to compute the scaling factor (a dedicated min/max
+    // reduction would avoid this element-wise host-side loop).
+    // NOTE: input_raw is read on the host here, consistent with the host-to-device copy
+    // performed in ShiftMaxforward.
+
+    double min = std::numeric_limits<double>::max();
+    double max = std::numeric_limits<double>::lowest();
+    for(std::size_t i = 0; i < dims_input[0]; i++) {
+        for(std::size_t j = 0; j < dims_input[1]; j++) {
+            for(std::size_t k = 0; k < dims_input[2]; k++) {
+                for(std::size_t l = 0; l < dims_input[3]; l++) {
+                    const std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    const double value = static_cast<double>(input_raw[input.getIdx(coordIdx)]);
+                    if (value < min) {
+                        min = value;
+                    }
+                    if (value > max) {
+                        max = value;
+                    }
+                }
+            }
+        }
+    }
+
+    // Symmetric quantization: map the largest absolute value onto the signed output_bits range.
+    double m = std::max(std::abs(min), std::abs(max));
+    double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1;
+    double scaling_factor = m / normalization_factor;
+
+    // The new scaling factor that we can use to dequantify the returned tensor (not used here)
+    // double new_SF = 1/std::pow(2,2*output_bits-1);
+
+    ShiftMaxforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input);
+}
+
+
+void Aidge::ShiftMaxImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output gradient #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * output_tensor = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr());
+
+    size_t size = output_grad.size();
+    std::vector<DimSize_t> dims_output = output_grad.dims();
+
+    T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
+
+    const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr());
+    ShiftMaxbackward(output_tensor, output_grad_raw, input_grad, size, dims_output);
+
+}
\ No newline at end of file
diff --git a/src/operator/ShiftMaxImpl_CUDA_kernels.cu b/src/operator/ShiftMaxImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba3cfcb51e02fb0befbf9f7c1fc054e73a2a7157
--- /dev/null
+++ b/src/operator/ShiftMaxImpl_CUDA_kernels.cu
@@ -0,0 +1,286 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y))
+#define CLAMP(X) (((X) < (0)) ? (0) : (X))
+
+#include <algorithm>  // std::min
+#include <cmath>      // std::pow
+#include <iostream>   // std::cerr
+#include <stdio.h>
+
+#include <cuda_runtime.h>
+
+#include "aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp"
+
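+// Shift-only integer approximation of exp(I * SF): I is first scaled by ~1/ln(2) using shifts
+// (I + I/2 - I/16), then split into quotient q and remainder r of I0 = floor(-1/SF); the result
+// is the linear term (r/2 - I0) scaled by 2^(N-q) and clamped to be non-negative. This follows
+// the shift-based exponential used in integer-only transformer quantization (I-BERT / I-ViT style).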
+__device__ inline int ExpShift(int I,int N, double SF)
+{
+    int Ip = I + (I >> 1) - (I >> 4);
+    int I0 = floorf(-1.0/SF);
+    Ip = MAX(Ip,N*I0);
+    int q = floorf(Ip / (I0));
+    int r = Ip -(I0*q);
+    int Ib = r/2 - I0;
+    Ib = CLAMP(Ib * powf(2,N-q));
+    return (int)Ib;
+}
+
+namespace Aidge{
+
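+// One thread per row of the innermost dimension: integer-only softmax. The row is quantized with
+// SF, shifted by its max, passed through the shift-based exponential, summed with an overflow
+// clamp, normalized via the per-row factor INT_MAX / sum, and written back dequantized with new_SF.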
+template <class T>
+__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+    int sum = 0;
+
+    if (x < dims[0] && y < dims[1] && z < dims[2]) {
+        int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3];
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            quantized_tensor[idx] = roundf(input[idx] / SF);
+        }
+        int maxVal = quantized_tensor[maxIdx];
+        for (int i = 1; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            maxVal = MAX(maxVal, quantized_tensor[idx]);
+        }
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            quantized_tensor[idx] = ExpShift(quantized_tensor[idx]-maxVal,N,SF);
+        }
+        for (int i = 0; i < dims[3]; i++) {
+            int idx = maxIdx + i;
+            if(quantized_tensor[idx] > 0 && sum > INT_MAX - quantized_tensor[idx])
+            {
+                sum = INT_MAX;
+                break;
+            }
+            else {
+                sum += quantized_tensor[idx];
+            }
+        }
+        factor[x * dims[1] * dims[2] + y * dims[2] + z] = floorf(INT_MAX/sum);
+        for(int i= 0; i < dims[3]; ++i)
+        {
+            int idx = maxIdx + i;
+            quantized_tensor[idx] = (quantized_tensor[idx] * factor[x * dims[1] * dims[2] + y * dims[2] + z]) >> (31-(2*output_bits-1));
+            input[idx] =quantized_tensor[idx]*new_SF;
+        }
+    }
+}
+
+template <>
+void ShiftMaxforward<float>(const float* input, float* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) {
+
+    double new_SF = 1 / std::pow(2, 2 * output_bits - 1); // New scaling factor
+
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    // Allocate memory on the GPU
+    float* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor, size * sizeof(float));
+    cudaMemcpy(input_cuda_tensor, input, size * sizeof(float), cudaMemcpyHostToDevice);
+
+    int* quantized_tensor;
+    cudaMalloc(&quantized_tensor, size * sizeof(int));
+
+    int* factor;
+    cudaMalloc(&factor, size * sizeof(int));
+
+    int* dims;
+    cudaMalloc(&dims, 4 * sizeof(int));
+    cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    // Calculate grid and block dimensions
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks(
+        (dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+        (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+        (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z
+    );
+
+    // Launch the integer ShiftMax forward kernel
+    ShiftMaxforward_<float><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, factor, dims, SF, N, output_bits, new_SF);
+    cudaDeviceSynchronize();
+
+    // Check for CUDA errors
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
+    }
+
+    // Copy the result back to host
+    cudaMemcpy(output, input_cuda_tensor, size * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // Free allocated memory on GPU
+    cudaFree(quantized_tensor);
+    cudaFree(factor);
+    cudaFree(dims);
+    cudaFree(input_cuda_tensor);
+}
+
+template <>
+void ShiftMaxforward<double>(const double* input, double* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) {
+
+    double new_SF = 1 / std::pow(2, 2 * output_bits - 1);
+
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims_input[i]);
+    }
+
+    // Allocate memory on the GPU
+    double* input_cuda_tensor;
+    cudaMalloc(&input_cuda_tensor, size * sizeof(double));
+    cudaMemcpy(input_cuda_tensor, input, size * sizeof(double), cudaMemcpyHostToDevice);
+
+    int* quantized_tensor;
+    cudaMalloc(&quantized_tensor, size * sizeof(int));
+
+    int* factor;
+    cudaMalloc(&factor, size * sizeof(int));
+
+    int* dims;
+    cudaMalloc(&dims, 4 * sizeof(int));
+    cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    // Calculate grid and block dimensions
+    dim3 threadsPerBlock(10, 10, 10);
+    dim3 numBlocks(
+        (dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+        (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y,
+        (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z
+    );
+
+    // Launch the integer ShiftMax forward kernel
+    ShiftMaxforward_<double><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, factor, dims, SF, N, output_bits, new_SF);
+    cudaDeviceSynchronize();
+
+    // Check for CUDA errors
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
+    }
+
+    // Copy the result back to host
+    cudaMemcpy(output, input_cuda_tensor, size * sizeof(double), cudaMemcpyDeviceToHost);
+
+    // Free allocated memory on GPU
+    cudaFree(quantized_tensor);
+    cudaFree(factor);
+    cudaFree(dims);
+    cudaFree(input_cuda_tensor);
+}
+
+
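+// Standard softmax backward per innermost row: dL/dx_i = y_i * (dL/dy_i - sum_j y_j * dL/dy_j),
+// where y is the forward softmax output.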
+template <class T>
+__global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T* output_grad, const int* dims) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < dims[0] * dims[1] * dims[2] * dims[3]) {
+        int w = (index / dims[3]) % dims[2];
+        int h = (index / dims[3] / dims[2]) % dims[1];
+        int n = index / dims[3] / dims[2] / dims[1];
+
+        float sum = 0.0f;
+        for (int i = 0; i < dims[3]; ++i) {
+            sum += output_tensor[n * dims[1] * dims[2] * dims[3] + h * dims[2] * dims[3] + w * dims[3] + i] * output_grad[n * dims[1] * dims[2] * dims[3] + h * dims[2] * dims[3] + w * dims[3] + i];
+        }
+        input_grad[index] = output_tensor[index] * (output_grad[index] - sum);
+    }
+}
+
+template <>
+void ShiftMaxbackward<float>(const float* output_tensor, const float* output_grad, float* input_grad, size_t size, std::vector<long unsigned int> dims)
+{   
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims[i]);
+    }
+
+    float* output_cuda_tensor;
+    cudaMalloc(&output_cuda_tensor,size*sizeof(float));
+    cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(float));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice);
+
+    float *input_grad_;
+    cudaMalloc(&input_grad_, size * sizeof(float));
+
+    int *dims_;
+    cudaMalloc(&dims_, 4 * sizeof(int));
+    cudaMemcpy(dims_, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ShiftMaxbackward_<float><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,dims_);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(input_grad, input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaFree(output_cuda_tensor);
+    cudaFree(input_grad_);
+    cudaFree(dims_);
+    cudaFree(output_grad_);
+}
+
+template <>
+void ShiftMaxbackward<double>(const double* output_tensor, const double* output_grad, double* input_grad, size_t size, std::vector<long unsigned int> dims)
+{   
+    int dims_input_cuda[4] = {1, 1, 1, 1};
+    for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) {
+        dims_input_cuda[i] = static_cast<int>(dims[i]);
+    }
+
+    double* output_cuda_tensor;
+    cudaMalloc(&output_cuda_tensor,size*sizeof(double));
+    cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double* output_grad_;
+    cudaMalloc(&output_grad_,size*sizeof(double));
+    cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice);
+
+    double *input_grad_;
+    cudaMalloc(&input_grad_, size * sizeof(double));
+
+    int *dims_;
+    cudaMalloc(&dims_, 4 * sizeof(int));
+    cudaMemcpy(dims_, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    dim3 threadParBlock(256);
+    dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x);
+
+    ShiftMaxbackward_<double><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,dims_);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if(err != cudaSuccess)
+    {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(input_grad,input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaFree(output_cuda_tensor);
+    cudaFree(input_grad_);
+    cudaFree(dims_);
+    cudaFree(output_grad_);
+}
+
+
+
+}
\ No newline at end of file
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index adebd2a1bdcede94f159627f67860e7ec60a5d85..a04a1c3018b0c9ba455d21ba563253eb3e004e10 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -44,6 +44,10 @@ void Aidge::SubImpl_cuda::forward() {
         std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
         dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
 
+        if (dims[i].size() < 4) {
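+            // Pad the shape with 1s up to 4-D so every input shares the layout expected by the
+            // cuDNN descriptors built below.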
+            dims[i].resize(4, 1);
+        }
+
         // Compute the corresponding strides
         std::vector<int> tensorStrides(dims[i].size());
         int product = 1;
@@ -197,7 +201,7 @@ void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<
                                tensorDesc,
                                &workspaceSize));
 
-            float *d_workspace;
+            void *d_workspace;
             CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
 
             CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
@@ -216,4 +220,4 @@ void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<
             CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
         }
     }
-}
\ No newline at end of file
+}
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index ab65c924e4ac9abecc132e5d7cbc4dc91e172821..807adc55e8c85f31f5e94013e174bf8cbc5a2320 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,9 +1,11 @@
 Include(FetchContent)
 
+set(CATCH2_VERSION v3.0.1)
+message(STATUS "Retrieving Catch2 ${CATCH2_VERSION} from git")
 FetchContent_Declare(
   Catch2
   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.0.1 # or a later release
+  GIT_TAG        ${CATCH2_VERSION} # or a later release
 )
 
 FetchContent_MakeAvailable(Catch2)
diff --git a/unit_tests/Test_AndImpl.cpp b/unit_tests/Test_AndImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66de926088bb47c06ea1f9f10655730404787149
--- /dev/null
+++ b/unit_tests/Test_AndImpl.cpp
@@ -0,0 +1,132 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") {
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 15},{31, 11},{22, 49}},   //
+                    {{41, 10},{24, 51},{27, 52}},   //
+                    {{26, 53},{27, 54},{28, 55}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{32, 59},{33, 60},{34, 61}},   //
+                    {{35, 62},{36, 63},{37, 64}}    //
+                },                                  //
+                {                                   //
+                    {{38, 65},{39, 66},{40, 67}},   //
+                    {{41, 68},{42, 69},{43, 70}},   //
+                    {{44, 71},{45, 72},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        input_1->setBackend("cuda");
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        input_2->setBackend("cuda");
+        const Tensor myOutput = Tensor(Array4D<float,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myAnd = And();
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setBackend("cuda");
+        op->setDataType(DataType::Float32);
+        myAnd->forward();
+
+
+        std::shared_ptr<Tensor> outputFallback;
+        const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+        REQUIRE(approxEq<float>(cudaOutput, myOutput));
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+        input_1->setBackend("cuda");
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}});  
+        const Tensor myOutput = Tensor(Array4D<float,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+        input_2->setBackend("cuda");
+        std::shared_ptr<Node> myAnd = And();
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myAnd->forward();
+
+        std::shared_ptr<Tensor> outputFallback;
+        const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+        REQUIRE(approxEq<float>(cudaOutput, myOutput));
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ArgMaxImpl.cpp b/unit_tests/Test_ArgMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d123b5bd3376c7169b2e003d8b366bb9045fe3e1
--- /dev/null
+++ b/unit_tests/Test_ArgMaxImpl.cpp
@@ -0,0 +1,155 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
+    SECTION("3D Tensor") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> {
+            {
+                {
+                    { 1.0, 2.0, 3.0, 4.0},
+                    { 8.0, 0.0, 17.0, 1.0},
+                    { 5.0, 10.0, 6.0, 0.0}
+                },
+                {
+                    { 7.0, 1.0, 9.0, 4.0},
+                    { 0.0, 8.0, 4.0, 2.0},
+                    { 9.0, 2.0, 0.0, 5.0}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        SECTION("Axis 2") {
+
+            const Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
+               { 
+                    { 
+                        {3.0},
+                        {2.0},
+                        {1.0}
+                    },
+                    {
+                        {2.0},
+                        {1.0},
+                        {0.0}
+                    }
+               }
+            });
+
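+            // Reduce along axis 2; keep_dims is left at its default, so the reduced axis is kept with size 1.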
+            std::shared_ptr<Node> myArgMax = ArgMax(2);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myArgMax->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+            REQUIRE(approxEq<float>(cudaOutput, myOutput));
+
+        }
+        SECTION("Axis 2 with keep_dims false") {
+
+            const Tensor myOutput = Tensor(Array2D<float,2,3> {
+               { 
+                    { 3.0, 2.0, 1.0 },
+                    { 2.0, 1.0, 0.0 }
+               }
+            });
+
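+            // Same reduction along axis 2, but the second argument disables keep_dims, so the reduced axis is dropped.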
+            std::shared_ptr<Node> myArgMax = ArgMax(2,0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myArgMax->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+            REQUIRE(approxEq<float>(cudaOutput, myOutput));
+        }
+        SECTION("Axis 1") {
+            const Tensor myOutput = Tensor(Array3D<float,2,1,4> {
+                {
+                    {
+                        { 1.0, 2.0, 1.0, 0.0 }
+                    },
+                    {
+                        { 2.0, 1.0, 0.0, 2.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myArgMax->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+            REQUIRE(approxEq<float>(cudaOutput, myOutput));
+        }
+        SECTION("Axis 0") {
+            const Tensor myOutput = Tensor(Array3D<float,1,3,4> {
+                {
+                    {
+                        { 1.0, 0.0, 1.0, 0.0 },
+                        { 0.0, 1.0, 0.0, 1.0 },
+                        { 1.0, 0.0, 0.0, 1.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myArgMax->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+            REQUIRE(approxEq<float>(cudaOutput, myOutput));
+        }
+    }
+    SECTION("Select_Last_Index") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> {
+            {
+                1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0
+            }
+        });
+        myInput->setBackend("cuda");
+        const Tensor myOutput = Tensor(Array1D<float,1> {{9}});
+
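+        // The maximum value 9.0 appears at indices 2, 6 and 9; with select_last_index enabled, index 9 is expected.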
+        std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1);
+        auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myArgMax->forward();
+
+        std::shared_ptr<Tensor> outputFallback;
+        const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput);
+        REQUIRE(approxEq<float>(cudaOutput, myOutput));
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp
index dc77e35b64fd22952e683e373fcc271c742ece75..72a4040a8ecbd091e24f8441d9c29970ea82c606 100644
--- a/unit_tests/Test_ConvImpl.cpp
+++ b/unit_tests/Test_ConvImpl.cpp
@@ -240,8 +240,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
         for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
         {
             const std::size_t kernel = kernelDist(gen);
-            std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2),
-                                                               std::size_t(10));
+            std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel),
+                                                                      std::size_t(10));
             const std::size_t nbDims = 4;
             std::vector<std::size_t> dims;
             for (std::size_t i = 0; i < nbDims; ++i) {
@@ -351,8 +351,9 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
 
             // forward CPU
             op_cpu->forward();
-            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
-            REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu));
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+            REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
 
             delete[] array0;
             delete[] weights;
diff --git a/unit_tests/Test_DivImpl.cpp b/unit_tests/Test_DivImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07cde5d6acb8eeeff2667e5c67aedb87b893e84c
--- /dev/null
+++ b/unit_tests/Test_DivImpl.cpp
@@ -0,0 +1,140 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>  // std::chrono::system_clock, std::chrono::duration
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] Div", "[Div][GPU]") {
+constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create Div Operator CUDA
+            std::shared_ptr<Node> myDivCUDA = Div();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myDivCUDA -> getOperator());
+
+            // Create Div Operator CPU
+            std::shared_ptr<Node> myDivCPU = Div();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myDivCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims0, dims1, dims;
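+            // Randomly collapse some axes of the second input to 1 to exercise broadcasting;
+            // the output dims are the element-wise max of dims0 and dims1.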
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                const std::size_t dim = dimSizeDist(gen);
+                dims0.push_back(dim);
+                if (boolDist(gen)) {
+                    dims1.push_back(1);
+                } else {
+                    dims1.push_back(dim);
+                }
+                dims.push_back(std::max(dims0[i], dims1[i]));
+            }
+
+            const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+            float* array0 = new float[nb_elements0];
+            float* array1 = new float[nb_elements1];
+
+            for (std::size_t i = 0; i < nb_elements0; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < nb_elements1; ++i) {
+                array1[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *array1_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims0);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims0);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0);
+
+            // input1 CUDA
+            std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+            T1_cuda->setDataType(DataType::Float32);
+            T1_cuda->setBackend("cuda");
+            T1_cuda->resize(dims1);
+            op_cuda->associateInput(1, T1_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1);
+            cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice);
+            T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1);
+
+            // input1 CPU
+            std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,T1_cpu);
+            T1_cpu->setDataType(DataType::Float32);
+            T1_cpu->setBackend("cpu");
+            T1_cpu->resize(dims1);
+            T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            // forward CPU
+            op_cpu->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+            REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+            delete[] array0;
+            delete[] array1;
+            cudaFree(array0_d);
+            cudaFree(array1_d);
+        }
+}
+} // namespace Aidge
diff --git a/unit_tests/Test_ILayerNormImpl.cpp b/unit_tests/Test_ILayerNormImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0487b7c4716596e0d2e7bcbdaf812358be4de3bf
--- /dev/null
+++ b/unit_tests/Test_ILayerNormImpl.cpp
@@ -0,0 +1,201 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 10.09.2024
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>  // std::fabs
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") {
+    SECTION("4D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61},
+                        {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}
+                    },
+                    {
+                        {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09},
+                        {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}
+                    }
+                },
+                {
+                    {
+                        {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85},
+                        {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}
+                    },
+                    {
+                        {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06},
+                        {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80}
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}});
+        std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, 0.3833238f ,-0.6842308f ,-0.4342245f ,-0.4717381f ,-0.1776187f, -0.2728751f, -0.4638580f, 0.2936697f, -0.9011016f}});
+
+        myWeight->setBackend("cuda");
+        myBias->setBackend("cuda");
+
+        std::shared_ptr<Node> myILayerNorm = ILayerNorm();
+        auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator());
+
+        op -> associateInput(1, myWeight);
+        op -> associateInput(2, myBias);
+
+        input0->setBackend("cuda");
+
+        op -> associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->forward();
+
+        // expected output
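+        // (all expected values are multiples of a single step of ~4.941e-02, reflecting ILayerNorm's quantised output)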
+        std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+        {
+            {
+                {
+                    {9.8821178e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02},
+                    {4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00}
+                },
+                {
+                    {0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 4.9410585e-02, 0.0000000e+00},
+                    {4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02}
+                }
+            },
+            {
+                {
+                    {0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02},
+                    {9.8821178e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00}
+                },
+                {
+                    {4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00},
+                    {4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02}
+                }
+            }
+        }
+    });
+
+
+        float* computedOutput   = new float[output_ilayernorm->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_ilayernorm->size(), cudaMemcpyDeviceToHost);
+
+        // Check that the forward results match the expected output
+        for (std::size_t i = 0; i < output_ilayernorm->size(); i++) {
+            const float targetOutput = *(static_cast<float*>(output_ilayernorm->getImpl()->rawPtr()) + i);
+            REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+}
+
+TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]")
+
+{   
+    std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW
+            {
+                    {
+                        {
+                            {1.46650600,  1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147},
+                        },
+                    },
+            }
+        });
+    
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW
+            {
+                    {
+                        {
+                            {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5},
+                        },
+                    },
+            }
+        });
+    
+    std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW
+            {
+                    {
+                        {
+                            {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
+                        },
+                    },
+            }
+        });
+
+    myWeight->setBackend("cuda");
+    myBias->setBackend("cuda");
+
+    std::shared_ptr<Node> myILayerNorm = ILayerNorm();
+    auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator());
+
+    op -> associateInput(1, myWeight);
+    op -> associateInput(2, myBias);
+
+    input0->setBackend("cuda");
+
+    op -> associateInput(0,input0);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    myILayerNorm->forward();
+
+    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 1.34347093,  0.90813798, 0.39607167,  1.20428133, 0.16845724,  0.48487359, 0.40748054, -0.21790814},
+                    },
+                },
+            }
+        });
+
+
+    myOutputGrad->setBackend("cuda");
+    std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+    std::shared_ptr<Tensor> input = op->getInput(0);
+    predictedOutput->setGrad(myOutputGrad);
+    REQUIRE_NOTHROW(myILayerNorm->backward());
+
+    std::shared_ptr<Tensor> expectedInputGradILayerNorm = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 0.467678, 0.310749, 0.1129, 0.351786, 0.0507252, 0.101587, 0.130249, -0.0646476},
+                    },
+                },
+            }
+        });
+
+
+    float *computedInputGradCuda = new float[myOutputGrad->size()]();
+    cudaMemcpy(computedInputGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost);
+
+    // Check that the backward results match the expected input gradient
+    for (std::size_t i = 0; i < expectedInputGradILayerNorm->size(); i++) {
+        const float targetOutput = *(static_cast<float*>(expectedInputGradILayerNorm->getImpl()->rawPtr()) + i);
+        REQUIRE(std::fabs(computedInputGradCuda[i] - targetOutput) < 2e-6);
+    }
+
+    delete[] computedInputGradCuda;
+}
diff --git a/unit_tests/Test_LnImpl.cpp b/unit_tests/Test_LnImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06e2205ba38ce0becd0326bf4d258b9f55a228bd
--- /dev/null
+++ b/unit_tests/Test_LnImpl.cpp
@@ -0,0 +1,106 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>  // std::chrono::system_clock, std::chrono::duration
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] Ln", "[Ln][GPU]") {
+constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create Ln Operator CUDA
+            std::shared_ptr<Node> myLnCUDA = Ln();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myLnCUDA -> getOperator());
+
+            // Create Ln Operator CPU
+            std::shared_ptr<Node> myLnCPU = Ln();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myLnCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+            float* array0 = new float[nb_elements];
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            // forward CPU
+            op_cpu->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+            REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+}
+} // namespace Aidge
diff --git a/unit_tests/Test_MulImpl.cpp b/unit_tests/Test_MulImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9eaba6e80971a7075576cd3d4d409b79dac4eb0c
--- /dev/null
+++ b/unit_tests/Test_MulImpl.cpp
@@ -0,0 +1,140 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>  // std::chrono::system_clock, std::chrono::duration
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] Mul", "[Mul][GPU]") {
+constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create Mul Operator CUDA
+            std::shared_ptr<Node> myMulCUDA = Mul();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMulCUDA -> getOperator());
+
+            // Create Mul Operator CPU
+            std::shared_ptr<Node> myMulCPU = Mul();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMulCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims0, dims1, dims;
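+            // As in the Div test, some axes of the second operand are randomly collapsed to 1 to cover broadcasting.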
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                const std::size_t dim = dimSizeDist(gen);
+                dims0.push_back(dim);
+                if (boolDist(gen)) {
+                    dims1.push_back(1);
+                } else {
+                    dims1.push_back(dim);
+                }
+                dims.push_back(std::max(dims0[i], dims1[i]));
+            }
+
+            const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+            float* array0 = new float[nb_elements0];
+            float* array1 = new float[nb_elements1];
+
+            for (std::size_t i = 0; i < nb_elements0; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < nb_elements1; ++i) {
+                array1[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *array1_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims0);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims0);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0);
+
+            // input1 CUDA
+            std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+            T1_cuda->setDataType(DataType::Float32);
+            T1_cuda->setBackend("cuda");
+            T1_cuda->resize(dims1);
+            op_cuda->associateInput(1, T1_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1);
+            cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice);
+            T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1);
+
+            // input1 CPU
+            std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,T1_cpu);
+            T1_cpu->setDataType(DataType::Float32);
+            T1_cpu->setBackend("cpu");
+            T1_cpu->resize(dims1);
+            T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            // forward CPU
+            op_cpu->forward();
+
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+            REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+            delete[] array0;
+            delete[] array1;
+            cudaFree(array0_d);
+            cudaFree(array1_d);
+        }
+}
+} // namespace Aidge
diff --git a/unit_tests/Test_PowImpl.cpp b/unit_tests/Test_PowImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49e65b46d7d85b7087c5c73151d643593d91e02e
--- /dev/null
+++ b/unit_tests/Test_PowImpl.cpp
@@ -0,0 +1,355 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>   // std::chrono::system_clock, std::chrono::duration
+#include <iostream> // std::cout
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    // To measure execution time of the 'Pow_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("PowImpl::forward()") {
+        SECTION("Scalar / Scalar") {
+
+        }
+        SECTION("Scalar / +1-D Tensor") {
+
+        }
+        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+
+            // Create Pow Operator
+            std::shared_ptr<Node> myPowCUDA = Pow();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            std::shared_ptr<Node> myPowCPU = Pow();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = valueDist(gen);
+                }
+
+                // input0 CUDA
+                float* array0_d, *array1_d;
+                std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+                T0_cuda->setDataType(DataType::Float32);
+                T0_cuda->setBackend("cuda");
+                T0_cuda->resize(dims);
+                op_cuda->associateInput(0, T0_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+                cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+                T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+                // input0 CPU
+                std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+                T0_cpu->setDataType(DataType::Float32);
+                T0_cpu->setBackend("cpu");
+                T0_cpu->resize(dims);
+                op_cpu->associateInput(0,T0_cpu);
+                T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // input1 CUDA
+                std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+                T1_cuda->setDataType(DataType::Float32);
+                T1_cuda->setBackend("cuda");
+                T1_cuda->resize(dims);
+                op_cuda->associateInput(1, T1_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements);
+                cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+                T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements);
+
+                // input1
+                std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+                T1_cpu->setDataType(DataType::Float32);
+                T1_cpu->setBackend("cpu");
+                T1_cpu->resize(dims);
+                op_cpu -> associateInput(1,T1_cpu);
+                T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements);
+
+                op_cuda->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPowCUDA->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                op_cpu->forwardDims();
+                myPowCPU->forward();
+
+                std::shared_ptr<Tensor> outputFallback;
+                const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+                REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+                delete[] array0;
+                delete[] array1;
+                cudaFree(array0_d);
+                cudaFree(array1_d);
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
+            // Create Pow Operator
+            std::shared_ptr<Node> myPowCUDA = Pow();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            std::shared_ptr<Node> myPowCPU = Pow();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+            
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // create arrays and fill them with random values
+                std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[array0_size];
+                float* array1 = new float[array1_size];
+
+                for (std::size_t i = 0; i < array0_size; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < array1_size; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+                // input0 CUDA
+                float* array0_d, *array1_d;
+                std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+                T0_cuda->setDataType(DataType::Float32);
+                T0_cuda->setBackend("cuda");
+                T0_cuda->resize(dims0);
+                op_cuda->associateInput(0, T0_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size);
+                cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice);
+                T0_cuda->getImpl()->setRawPtr(array0_d, array0_size);
+
+                // input0 CPU
+                std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+                T0_cpu->setDataType(DataType::Float32);
+                T0_cpu->setBackend("cpu");
+                op_cpu->associateInput(0,T0_cpu);
+                T0_cpu->resize(dims0);
+                T0_cpu -> getImpl() -> setRawPtr(array0, array0_size);
+
+                // input1 CUDA
+                std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+                T1_cuda->setDataType(DataType::Float32);
+                T1_cuda->setBackend("cuda");
+                T1_cuda->resize(dims1);
+                op_cuda->associateInput(1, T1_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size);
+                cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice);
+                T1_cuda->getImpl()->setRawPtr(array1_d, array1_size);
+
+                // input1
+                std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+                T1_cpu->setDataType(DataType::Float32);
+                T1_cpu->setBackend("cpu");
+                T1_cpu->resize(dims1);
+                op_cpu -> associateInput(1,T1_cpu);
+                T1_cpu -> getImpl() -> setRawPtr(array1, array1_size);
+
+                op_cuda->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPowCUDA->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                op_cpu->forwardDims();
+                myPowCPU->forward();
+
+                std::shared_ptr<Tensor> outputFallback;
+                const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+                REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+                delete[] array0;
+                delete[] array1;
+                cudaFree(array0_d);
+                cudaFree(array1_d);
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+        SECTION("+1-D Tensor / 1-D Tensor") {
+            // Create Pow Operator
+            std::shared_ptr<Node> myPowCUDA = Pow();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            std::shared_ptr<Node> myPowCPU = Pow();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            std::size_t number_of_operation = 0;
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims0(4);
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims0[i] = dimSizeDist(gen);
+                }
+                std::vector<std::size_t> dimsOut = dims0;
+                std::vector<std::size_t> dims1 = dims0;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                }
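+                // Remove 1 to 3 leading axes from the second input so it has a lower rank than the first,
+                // exercising broadcasting between tensors of different ranks.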
+                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+
+                // create arrays and fill them with random values
+                std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[array0_size];
+                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array1 = new float[array1_size];
+
+                for (std::size_t i = 0; i < array0_size; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < array1_size; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+
+                // input0 CUDA
+                float* array0_d, *array1_d;
+                std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+                T0_cuda->setDataType(DataType::Float32);
+                T0_cuda->setBackend("cuda");
+                T0_cuda->resize(dims0);
+                op_cuda->associateInput(0, T0_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size);
+                cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice);
+                T0_cuda->getImpl()->setRawPtr(array0_d, array0_size);
+
+                // input0 CPU
+                std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+                T0_cpu->setDataType(DataType::Float32);
+                T0_cpu->setBackend("cpu");
+                op_cpu->associateInput(0,T0_cpu);
+                T0_cpu->resize(dims0);
+                T0_cpu -> getImpl() -> setRawPtr(array0, array0_size);
+
+                // input1 CUDA
+                std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+                T1_cuda->setDataType(DataType::Float32);
+                T1_cuda->setBackend("cuda");
+                T1_cuda->resize(dims1);
+                op_cuda->associateInput(1, T1_cuda);
+                cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size);
+                cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice);
+                T1_cuda->getImpl()->setRawPtr(array1_d, array1_size);
+
+                // input1
+                std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+                T1_cpu->setDataType(DataType::Float32);
+                T1_cpu->setBackend("cpu");
+                T1_cpu->resize(dims1);
+                op_cpu -> associateInput(1,T1_cpu);
+                T1_cpu -> getImpl() -> setRawPtr(array1, array1_size);
+
+                op_cuda->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPowCUDA->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                op_cpu->forwardDims();
+                myPowCPU->forward();
+
+                std::shared_ptr<Tensor> outputFallback;
+                const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));
+                REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0))));
+
+                delete[] array0;
+                delete[] array1;
+                cudaFree(array0_d);
+                cudaFree(array1_d);
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/Test_ReduceMeanImpl.cpp b/unit_tests/Test_ReduceMeanImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..041ad6e02d5f39fde22f34ce715d2b807e164b1a
--- /dev/null
+++ b/unit_tests/Test_ReduceMeanImpl.cpp
@@ -0,0 +1,333 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>   // fabs
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") {
+    SECTION("KeepDims") {
+        SECTION("test 1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> {
+                {
+
+                    {{ 12.5, 1.5 }},
+                    {{ 35.0, 1.5 }},
+                    {{ 57.5, 1.5 }}
+                }
+            });
+
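+            // Mean over axis 1; keep_dims is left at its default, so the reduced axis is kept with size 1.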
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1});
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceMean->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+        SECTION("test 2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
+                {
+                    {
+                        { 0.0, 0.0 },
+                        { 1.0, 1.0 },
+                        { 2.0, 2.0 }
+                    },
+                    {
+                        { 3.0, 3.0 },
+                        { 4.0, 4.0 },
+                        { 5.0, 5.0 }
+                    },
+                    {
+                        { 6.0, 6.0 },
+                        { 7.0, 7.0 },
+                        { 8.0, 8.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> {
+                {
+
+                    {{ 1.0 }},
+                    {{ 4.0 }},
+                    {{ 7.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2});
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceMean->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+    }
+    SECTION("not_KeepDims") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 5.0, 1.0 },
+                    { 20.0, 2.0 }
+                },
+                {
+                    { 30.0, 1.0 },
+                    { 40.0, 2.0 }
+                },
+                {
+                    { 55.0, 1.0 },
+                    { 60.0, 2.0 }
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
+            {
+                { 12.5, 1.5 },
+                { 35.0, 1.5 },
+                { 57.5, 1.5 }
+            }
+        });
+
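+        // Same reduction over axis 1, but with keep_dims set to false the reduced axis is removed from the output shape.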
+        std::shared_ptr<Node> myReduceMean = ReduceMean({1}, false);
+        auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReduceMean->forward();
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+
+    }
+    SECTION("all_axes") {
+        SECTION("1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {18.25}
+            });
+
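+            // reduce over every axis: the output is the mean of all twelve input values (18.25)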
+            std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, false);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceMean->forward();
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+        SECTION("2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
+               {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
+                { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
+                { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
+                { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
+                { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {0.1293547f}
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, false);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceMean->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+        SECTION("noop_with_empty_axes") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
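+            // with an empty axes list and noop_with_empty_axes enabled, the operator is expected to act as an identity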
+            std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, true);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceMean->forward();
+            
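+            // bring the input back to CPU so its values can be read on the host as the reference output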
+            myInput->setBackend("cpu");
+            float* computedOutput   = new float[myInput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myInput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+    }
+}
+
+TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") {
+    SECTION("KeepDims") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 5.0, 1.0 },
+                    { 20.0, 2.0 }
+                },
+                {
+                    { 30.0, 1.0 },
+                    { 40.0, 2.0 }
+                },
+                {
+                    { 55.0, 1.0 },
+                    { 60.0, 2.0 }
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+
+
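+        // run a forward pass first so that the output tensor exists before its gradient is set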
+        std::shared_ptr<Node> myReduceMean = ReduceMean({1});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReduceMean->forward();
+
+
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> {
+            {
+                {{ 1.0, 2.0 }},
+                {{ 3.0, 4.0 }},
+                {{ 5.0, 6.0 }}
+            }
+        });
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 1.0, 2.0 },
+                    { 1.0, 2.0 }
+                },
+                {
+                    { 3.0, 4.0 },
+                    { 3.0, 4.0 }
+                },
+                {
+                    { 5.0, 6.0 },
+                    { 5.0, 6.0 }
+                }
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        op->getOutput(0)->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myReduceMean->backward());
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+        
+        for(int i = 0; i < expectedInputGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
+}
diff --git a/unit_tests/Test_ReduceSumImpl.cpp b/unit_tests/Test_ReduceSumImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0d37754102331c8f91a1ce1c81d679761916339
--- /dev/null
+++ b/unit_tests/Test_ReduceSumImpl.cpp
@@ -0,0 +1,297 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>   // fabs
+#include <memory>  // std::shared_ptr, std::static_pointer_cast
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") {
+    SECTION("KeepDims") {
+        SECTION("test 1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> {
+                {
+                    {{ 25.0, 3.0 }},
+                    {{ 70.0, 3.0 }},
+                    {{ 115.0, 3.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({1});
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceSum->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+        SECTION("test 2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
+                {
+                    {
+                        { 0.0, 0.0 },
+                        { 1.0, 1.0 },
+                        { 2.0, 2.0 }
+                    },
+                    {
+                        { 3.0, 3.0 },
+                        { 4.0, 4.0 },
+                        { 5.0, 5.0 }
+                    },
+                    {
+                        { 6.0, 6.0 },
+                        { 7.0, 7.0 },
+                        { 8.0, 8.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> {
+                {
+                    {{ 6.0 }},
+                    {{ 24.0 }},
+                    {{ 42.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2});
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceSum->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+    }
+    SECTION("not_KeepDims") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 5.0, 1.0 },
+                    { 20.0, 2.0 }
+                },
+                {
+                    { 30.0, 1.0 },
+                    { 40.0, 2.0 }
+                },
+                {
+                    { 55.0, 1.0 },
+                    { 60.0, 2.0 }
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
+            {
+                { 25.0, 3.0 },
+                { 70.0, 3.0 },
+                { 115.0, 3.0 }
+            }
+        });
+
+        std::shared_ptr<Node> myReduceSum = ReduceSum({1}, false);
+        auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReduceSum->forward();
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+
+    }
+    SECTION("all_axes") {
+        SECTION("1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {219.0}
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, false);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceSum->forward();
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+        SECTION("2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
+               {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
+                { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
+                { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
+                { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
+                { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
+            });
+            myInput->setBackend("cuda");
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {2.587094f}
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, false);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+            myReduceSum->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
+    }
+}
+
+TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") {
+    SECTION("KeepDims") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 5.0, 1.0 },
+                    { 20.0, 2.0 }
+                },
+                {
+                    { 30.0, 1.0 },
+                    { 40.0, 2.0 }
+                },
+                {
+                    { 55.0, 1.0 },
+                    { 60.0, 2.0 }
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+
+
+        std::shared_ptr<Node> myReduceSum = ReduceSum({1});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReduceSum->forward();
+
+
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> {
+            {
+                {{ 1.0, 2.0 }},
+                {{ 3.0, 4.0 }},
+                {{ 5.0, 6.0 }}
+            }
+        });
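+        // for a sum reduction, the backward pass broadcasts the output gradient along the reduced axis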
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 1.0, 2.0 },
+                    { 1.0, 2.0 }
+                },
+                {
+                    { 3.0, 4.0 },
+                    { 3.0, 4.0 }
+                },
+                {
+                    { 5.0, 6.0 },
+                    { 5.0, 6.0 }
+                }
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        op->getOutput(0)->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myReduceSum->backward());
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+        
+        for(int i = 0; i < expectedInputGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
+}
diff --git a/unit_tests/Test_ShiftGELUImpl.cpp b/unit_tests/Test_ShiftGELUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..86e747e735eccb397caa8062f52c2561e8ef759d
--- /dev/null
+++ b/unit_tests/Test_ShiftGELUImpl.cpp
@@ -0,0 +1,220 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>  // fabs
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") {
+    SECTION("4D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61},
+                        {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}
+                    },
+                    {
+                        {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09},
+                        {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}
+                    }
+                },
+                {
+                    {
+                        {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85},
+                        {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}
+                    },
+                    {
+                        {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06},
+                        {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80}
+                    }
+                }
+            }
+        });
+
+        //expected output of shiftgelu forward operator
+        std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0.991388f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f },
+                        { 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f }
+                    },
+                    {
+                        { 0.0f, 0.413078f, 0.991388f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.991388f, 0.413078f, 0.0f },
+                        { 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.413078f }
+                    }
+                },
+                {
+                    {
+                        { 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f },
+                        { 0.991388f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f}
+                    },
+                    {
+                        { 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f },
+                        { 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.0f, 0.0f, 0.413078f }
+                    }
+                }
+            }
+        });
+
+        //expected output of GELU forward operator (computed with PyTorch)
+        std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> {
+            {
+                {
+                    {
+                        { 0.7982f, 0.3285f, 0.3809f, 0.3371f, 0.4262f, 0.7661f, 0.0000f, 0.0000f, 0.4447f, 0.4447f },
+                        { 0.6820f, 0.0314f, 0.0598f, 0.7028f, 0.3899f, 0.0657f, 0.6305f, 0.3285f, 0.2702f, 0.0902f }
+                    },
+                    {
+                        { 0.1428f, 0.3115f, 0.8090f, 0.1093f, 0.4824f, 0.0657f, 0.2948f, 0.8413f, 0.2384f, 0.0482f },
+                        { 0.2948f, 0.4729f, 0.1225f, 0.4170f, 0.0260f, 0.1428f, 0.3989f, 0.0370f, 0.3371f, 0.6203f }
+                    }
+                },
+                {
+                    {
+                        { 0.0000f, 0.0717f, 0.3899f, 0.2784f, 0.3371f, 0.1709f, 0.3632f, 0.3899f, 0.2152f, 0.6820f },
+                        { 0.8197f, 0.2002f, 0.0482f, 0.0260f, 0.2384f, 0.3200f, 0.4635f, 0.0717f, 0.5306f, 0.0102f }
+                    },
+                    {
+                        { 0.5209f, 0.0717f, 0.5701f, 0.4447f, 0.1497f, 0.7028f, 0.3115f, 0.2622f, 0.6407f, 0.0314f },
+                        { 0.7238f, 0.2002f, 0.4447f, 0.1428f, 0.5306f, 0.1359f, 0.0482f, 0.0154f, 0.0778f, 0.6305f }
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myShiftGELU = ShiftGELU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->forward();
+        
+        float* computedOutput   = new float[output_shiftGELU->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftGELU->size(), cudaMemcpyDeviceToHost);
+
+        // test that the forward results are as expected
+        for(int i = 0; i < output_shiftGELU->size(); i++){
+            const float targetOutput = *(static_cast<float*>(output_shiftGELU->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        // measure the mean absolute difference between the GELU reference and ShiftGELU outputs
+        float sum = 0.0;
+        for(int i = 0; i < output_GELU->size(); i++){
+            const float targetOutput = *(static_cast<float*>(output_GELU->getImpl()->rawPtr()) + i);
+            sum += fabs(computedOutput[i] - targetOutput);
+        }
+        sum = sum / output_GELU->size();
+        REQUIRE(sum < 1.5e-1);
+
+        delete[] computedOutput;
+    }
+
+}
+
+TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]")
+
+{
+
+    std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW
+            {
+                    {
+                        {
+                            {1.46650600,  1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147},
+                        },
+                    },
+            }
+        });
+    
+    input0->setBackend("cuda");
+
+    std::shared_ptr<Node> myShiftGELU = ShiftGELU();
+    auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    myShiftGELU->forward();
+
+    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 1.34347093,  0.90813798, 0.39607167,  1.20428133, 0.16845724,  0.48487359, 0.40748054, -0.21790814},
+                    },
+                },
+            }
+        });
+
+
+    myOutputGrad->setBackend("cuda");
+    std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+    std::shared_ptr<Tensor> input = op->getInput(0);
+    predictedOutput->setGrad(myOutputGrad);
+    REQUIRE_NOTHROW(myShiftGELU->backward());
+
+    //expected output of shiftgelu backward operator
+    std::shared_ptr<Tensor> expectedInputGradShiftGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 1.88094, 1.09182, 0.134203, 0.439603, 0.0696628, 0.173469, 0.254718, -0.084009},
+                    },
+                },
+            }
+        });
+
+    //expected output of gelu backward operator (computed with PyTorch)
+    std::shared_ptr<Tensor> expectedInputGradGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        {  1.5159,  1.0188,  0.0971,  0.4578,  0.0931, -0.0499,  0.3620, -0.1000},
+                    },
+                },
+            }
+        });
+
+
+    float *computedGradCuda = new float[myOutputGrad->size()]();
+
+    cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost);
+
+    // test that the backward results are as expected
+    for(int i = 0; i < expectedInputGradShiftGELU->size(); i++){
+        const float targetOutput = *(static_cast<float*>(expectedInputGradShiftGELU->getImpl()->rawPtr()) + i);
+        REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6);  
+    }
+
+    // measure the mean absolute difference between the GELU reference and ShiftGELU gradients
+    float sum = 0.0;
+    for(int i = 0; i < expectedInputGradGELU->size(); i++){
+        const float targetOutput = *(static_cast<float*>(expectedInputGradGELU->getImpl()->rawPtr()) + i);
+        sum += fabs(computedGradCuda[i] - targetOutput);
+    }
+    sum = sum / expectedInputGradGELU->size();
+    REQUIRE(sum < 2e-1);
+
+
+    delete[] computedGradCuda;
+}
diff --git a/unit_tests/Test_ShiftMaxImpl.cpp b/unit_tests/Test_ShiftMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a94a23c3a04edd72cb535ebfb6e2c538e4aeee8
--- /dev/null
+++ b/unit_tests/Test_ShiftMaxImpl.cpp
@@ -0,0 +1,217 @@
+/********************************************************************************
+ * Copyright (c) 2024 Thales
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>  // fabs
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") {
+    SECTION("4D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61},
+                        {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}
+                    },
+                    {
+                        {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09},
+                        {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}
+                    }
+                },
+                {
+                    {
+                        {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85},
+                        {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}
+                    },
+                    {
+                        {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06},
+                        {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80}
+                    }
+                }
+            }
+        });
+        //expected output of shiftmax forward operator
+        std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.055542f, 0.055542f, 0.111084f, 0.111084f },
+                        { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }
+                    },
+                    {
+                        { 0.0624695f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f },
+                        { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }
+                    }
+                },
+                {
+                    {
+                        { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f },
+                        { 0.124969f, 0.124969f, 0.0624695f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f }
+                    },
+                    {
+                        { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f },
+                        { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }
+                    }
+                }
+            }
+        });
+        //expected output of softmax forward operator (computed with PyTorch)
+        std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> {
+            {
+                {
+                    {
+                        { 0.1484f, 0.0918f, 0.0975f, 0.0928f, 0.1025f, 0.1440f, 0.0568f, 0.0568f, 0.1046f, 0.1046f },
+                        { 0.1436f, 0.0652f, 0.0685f, 0.1465f, 0.1064f, 0.0692f, 0.1366f, 0.0992f, 0.0925f, 0.0721f }
+                    },
+                    {
+                        { 0.0768f, 0.0957f, 0.1593f, 0.0730f, 0.1157f, 0.0681f, 0.0938f, 0.1642f, 0.0874f, 0.0661f },
+                        { 0.1005f, 0.1227f, 0.0798f, 0.1156f, 0.0680f, 0.0823f, 0.1133f, 0.0694f, 0.1056f, 0.1426f }
+                    }
+                },
+                {
+                    {
+                        { 0.0645f, 0.0734f, 0.1118f, 0.0981f, 0.1052f, 0.0853f, 0.1085f, 0.1118f, 0.0906f, 0.1509f },
+                        { 0.1743f, 0.0901f, 0.0716f, 0.0688f, 0.0947f, 0.1047f, 0.1228f, 0.0745f, 0.1317f, 0.0667f }
+                    },
+                    {
+                        { 0.1164f, 0.0665f, 0.1224f, 0.1075f, 0.0750f, 0.1394f, 0.0925f, 0.0871f, 0.1313f, 0.0620f },
+                        { 0.1551f, 0.0877f, 0.1172f, 0.0810f, 0.1283f, 0.0802f, 0.0697f, 0.0656f, 0.0733f, 0.1418f }
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myShiftMax = ShiftMax();
+        auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->forward();
+        
+        float* computedOutput   = new float[output_shiftmax->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftmax->size(), cudaMemcpyDeviceToHost);
+
+        // test that the forward results are as expected
+        for(int i = 0; i < output_shiftmax->size(); i++){
+            const float targetOutput = *(static_cast<float*>(output_shiftmax->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        // measure the mean absolute difference between the softmax reference and ShiftMax outputs
+        float sum = 0.0;
+        for(int i = 0; i < output_softmax->size(); i++){
+            const float targetOutput = *(static_cast<float*>(output_softmax->getImpl()->rawPtr()) + i);
+            sum += fabs(computedOutput[i] - targetOutput);
+        }
+        sum = sum / output_softmax->size();
+        REQUIRE(sum < 4e-2);
+
+        delete[] computedOutput;
+    }
+
+}
+
+TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]")
+
+{
+
+    std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW
+            {
+                    {
+                        {
+                            {1.46650600,  1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147},
+                        },
+                    },
+            }
+        });
+    
+    input0->setBackend("cuda");
+
+    std::shared_ptr<Node> myShiftMax = ShiftMax();
+    auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    myShiftMax->forward();
+
+    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 1.34347093,  0.90813798, 0.39607167,  1.20428133, 0.16845724,  0.48487359, 0.40748054, -0.21790814},
+                    },
+                },
+            }
+        });
+
+
+    myOutputGrad->setBackend("cuda");
+    std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+    std::shared_ptr<Tensor> input = op->getInput(0);
+    predictedOutput->setGrad(myOutputGrad);
+    REQUIRE_NOTHROW(myShiftMax->backward());
+
+    //expected output of shiftmax backward operator
+    std::shared_ptr<Tensor> expectedInputGradShiftMax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 0.159378, 0.0249331, -0.0250217, 0.0262418, -0.0514701, -0.00459638, -0.0551896, -0.0739511},
+                    },
+                },
+            }
+        });
+
+    //expected output of softmax backward operator (computed with PyTorch)
+    std::shared_ptr<Tensor> expectedInputGradSoftmax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> {
+            {
+                {
+                    {
+                        { 0.1672,  0.0198, -0.0236,  0.0241, -0.0535, -0.0042, -0.0547, -0.0752},
+                    },
+                },
+            }
+        });
+
+
+    float *computedGradCuda = new float[myOutputGrad->size()]();
+
+    cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost);
+
+    // test that the backward results are as expected
+    for(int i = 0; i < expectedInputGradShiftMax->size(); i++){
+        const float targetOutput = *(static_cast<float*>(expectedInputGradShiftMax->getImpl()->rawPtr()) + i);
+        REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+    }
+
+    // measure the mean absolute difference between the softmax reference and ShiftMax gradients
+    float sum = 0.0;
+    for(int i = 0; i < expectedInputGradSoftmax->size(); i++){
+        const float targetOutput = *(static_cast<float*>(expectedInputGradSoftmax->getImpl()->rawPtr()) + i);
+        sum += fabs(computedGradCuda[i] - targetOutput);
+    }
+    sum = sum / expectedInputGradSoftmax->size();
+    REQUIRE(sum < 4e-3);
+
+    delete[] computedGradCuda;
+}
diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp
index cad4a1a067d55fe5c8246d09a733f44007886dc0..cb120a970c5310f80f8c62960c029a845937ba30 100644
--- a/unit_tests/Test_TensorImpl.cpp
+++ b/unit_tests/Test_TensorImpl.cpp
@@ -122,3 +122,42 @@ TEST_CASE("Tensor creation", "[Connector]") {
         REQUIRE(val[7] == 8);
     }
 }
+
+TEST_CASE("Tensor Descriptor Update") {
+    Tensor x;
+    x.setBackend("cuda");
+
+    std::vector<std::size_t> shapeA = { 7, 6, 5, 4, 3 };
+    x.resize(shapeA);
+
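+    // the cuDNN tensor descriptor should report the same dimensions as the Aidge tensor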
+    cudnnTensorDescriptor_t desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x);
+
+    cudnnDataType_t currentDataType;
+    int currentNbDims;
+    std::vector<int> currentDimA(shapeA.size());
+    std::vector<int> currentStrideA(shapeA.size());
+
+    REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeA.size(), &currentDataType, &currentNbDims, currentDimA.data(), currentStrideA.data()));
+
+    REQUIRE(std::equal(currentDimA.begin(), currentDimA.end(), shapeA.begin(), [](int a, std::size_t b) {
+                            return static_cast<std::size_t>(a) == b;
+                        }
+                      )
+            );
+
+    // Change the tensor shape and check tensor descriptor
+    std::vector<std::size_t> shapeB = { 6, 5, 4 };
+    x.resize(shapeB);
+
+    std::vector<int> currentDimB(shapeB.size());
+    std::vector<int> currentStrideB(shapeB.size());
+
+    desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x);
+    REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeB.size(), &currentDataType, &currentNbDims, currentDimB.data(), currentStrideB.data()));
+
+    REQUIRE(std::equal(currentDimB.begin(), currentDimB.end(), shapeB.begin(), [](int a, std::size_t b) {
+                            return static_cast<std::size_t>(a) == b;
+                        }
+                      )
+            );
+}
diff --git a/version.txt b/version.txt
index f4778493c50025c6ab147a1fec7486ef0c706792..69367fd08f3ce302151ebc9779193d517dfa32de 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1,2 @@
-0.2.2
\ No newline at end of file
+0.3.0
+