diff --git a/.gitignore b/.gitignore index 9fbfccca6dfda997d8a0dbfc4b373590feeecad8..f3571f3fefd133675c1989b50247a12d107bc685 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# common +.cache + # C++ Build build*/ install*/ @@ -10,6 +13,9 @@ install*/ __pycache__ *.pyc *.egg-info +dist*/ +wheelhouse/* +_version.py # Mermaid *.mmd @@ -18,4 +24,4 @@ __pycache__ xml*/ # ONNX -*.onnx \ No newline at end of file +*.onnx diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 271581e4d845ea93d5fd3a09f471edec69913277..1c2606707d16cd401e575354a1cbb99a11451ff8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,39 +1,64 @@ -################################################################################ -# Pre-configured CI/CD for your Aidge module. -# -# Three stages are already pre-configured to run on Eclipse Aidge CI: -# - build: ubuntu_cpp, ubuntu_python and windows_cpp; -# - test: ubuntu_cpp, ubuntu_python and windows_cpp; -# - coverage: ubuntu_cpp and ubuntu_python. -# -# If your project is pure C++ or pure Python, you can remove the "_python" or -# "_cpp" jobs respectively. -# "ubuntu" jobs require an Ubuntu runner with a docker executor with tag -# "docker". -# "windows" jobs require a Windows runner with a docker-windows executor with -# tag "windows". -# -# You can change the docker images in the YML scripts directly. The default -# images are: -# - nvidia/cuda:12.2.0-devel-ubuntu22.04 for Ubuntu jobs; -# - buildtools for Windows jobs, built on top of -# mcr.microsoft.com/windows/servercore:ltsc2022 with Microsoft Visual Studio -# 2022 BuildTools installed. -# -# See Aidge project wiki for more details on how to setup your own docker images -# and Gitlab runners. -################################################################################ +############################################################################### +# Aidge Continuous Integration and Deployment # +# # +############################################################################### stages: - # Build + - static_analysis - build - # Unit test stage - test - # Code coverage - coverage + - release + - deploy include: - - local: '/.gitlab/ci/_global.gitlab-ci.yml' - - local: '/.gitlab/ci/build.gitlab-ci.yml' - - local: '/.gitlab/ci/test.gitlab-ci.yml' - - local: '/.gitlab/ci/coverage.gitlab-ci.yml' + - project: 'eclipse/aidge/gitlab_shared_files' + ref: 'main' + file: + # choose which jobs to run by including the corresponding files. 
+ - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml' + + - '.gitlab/ci/ubuntu_python.gitlab-ci.yml' + - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml' + + # - '.gitlab/ci/windows_cpp.gitlab-ci.yml' + + # - '.gitlab/ci/windows_python.gitlab-ci.yml' + # - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml' + + +release:pip:ubuntu: + tags: + - release:cuda + variables: + DOCKER_HOST: unix:///var/run/docker.sock + CIBW_ENVIRONMENT: >- + BUILD_WITH_CUDA=1 + AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu' + AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL' + CUDA_TOOLKIT_VERSION='11-8' + DOCKER_HOST='unix:///var/run/docker.sock' + ARCH='x86_64' + CUDNN_VERSION='9' + CUDA_MAJOR_VERSION='11' + CUDA_MINOR_VERSION='8' + SEARCH_PATH='/home/ubuntu/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID' + + parallel: + matrix: + - CIBW_BUILD: "cp38-manylinux_x86_64" + - CIBW_BUILD: "cp39-manylinux_x86_64" + - CIBW_BUILD: "cp310-manylinux_x86_64" + + before_script: + # retrieve aidge dependencies + - DEPENDENCY_JOB="build:ubuntu_python" + - !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml + + script: + - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse + + after_script: + # Ensure all files are owned by the correct user at the end of the job + - sudo chown -R $(whoami):$(whoami) . + diff --git a/.gitlab/ci/_global.gitlab-ci.yml b/.gitlab/ci/_global.gitlab-ci.yml deleted file mode 100644 index ccc83c5d24623f1f00eaa78bc596f0cb1ff429dc..0000000000000000000000000000000000000000 --- a/.gitlab/ci/_global.gitlab-ci.yml +++ /dev/null @@ -1,25 +0,0 @@ -################################################################################ -# Centralized definitions of common job parameter values. # -# Parameters with many optional configurations may be in separate files. 
# -# # -################################################################################ -variables: - GIT_SUBMODULE_STRATEGY: recursive - OMP_NUM_THREADS: 4 - GIT_SSL_NO_VERIFY: 1 - DEBIAN_FRONTEND: noninteractive - -# See https://docs.gitlab.com/ee/ci/yaml/workflow.html#switch-between-branch-pipelines-and-merge-request-pipelines -workflow: - rules: - - if: $CI_PIPELINE_SOURCE == "merge_request_event" - - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS - when: never - - if: $CI_COMMIT_BRANCH - -default: - image: nvidia/cuda:12.2.0-devel-ubuntu22.04 - before_script: - - apt update - - apt install -y cmake cppcheck python-is-python3 pip git gcovr unzip curl - - apt install -y libcudnn8-dev diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml deleted file mode 100644 index c5d22e779753ee0fbfa0bcbd828f85639ace8b9f..0000000000000000000000000000000000000000 --- a/.gitlab/ci/build.gitlab-ci.yml +++ /dev/null @@ -1,242 +0,0 @@ -include: - #- remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml' - - remote: 'https://gitlab.eclipse.org/hrouis/gitlab_shared_files/-/raw/test_hro/.gitlab/ci/shared_script.gitlab-ci.yml' - -build:ubuntu_cpp: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_cpp" - # aidge_core - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - # Build current module - - export CMAKE_PREFIX_PATH=../install_cpp - - mkdir -p build_cpp - - cd build_cpp - - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON .. - - make -j4 all install - - artifacts: - expire_in: 1 week - paths: - - build_cpp/ - - install_cpp/ - -build:ubuntu_cpp_g++10: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_cpp" - # aidge_core - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - # Build current module - - export CMAKE_PREFIX_PATH=../install_cpp - - apt install -y g++-10 - - mkdir -p build_cpp - - mkdir -p install_cpp - - cd build_cpp - - export CXX=/usr/bin/g++-10 - - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON .. - - make -j4 all install - -build:ubuntu_cpp_g++12: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_cpp" - # aidge_core - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - - # Build current module - - export CMAKE_PREFIX_PATH=../install_cpp - - apt install -y g++-12 - - mkdir -p build_cpp - - mkdir -p install_cpp - - cd build_cpp - - export CXX=/usr/bin/g++-12 - - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON .. 
- - make -j4 all install - -build:ubuntu_cpp_clang12: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_cpp" - # aidge_core - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - - # Build current module - - export CMAKE_PREFIX_PATH=../install_cpp - - apt install -y clang-12 - - mkdir -p build_cpp - - mkdir -p install_cpp - - cd build_cpp - - export CXX=/usr/bin/clang++-12 - - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON .. - - make -j4 all install - -build:ubuntu_cpp_clang15: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_cpp" - # aidge_core - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - # Build current module - - export CMAKE_PREFIX_PATH=../install_cpp - - apt install -y clang-15 - - mkdir -p build_cpp - - mkdir -p install_cpp - - cd build_cpp - - export CXX=/usr/bin/clang++-15 - - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON .. - - make -j4 all install - -build:ubuntu_python: - stage: build - needs: [] - tags: - - docker - - script: - # Download dependencies - - DEPENDENCY_JOB="build:ubuntu_python" - # aidge_core (python) - - DEPENDENCY_NAME="aidge_core" - - !reference [.download_dependency, script] - # aidge_backend_cpu (python) - - DEPENDENCY_NAME="aidge_backend_cpu" - - !reference [.download_dependency, script] - - - python3 -m pip install virtualenv - - virtualenv venv - - source venv/bin/activate - - python3 -m pip install -r requirements.txt - - python3 -m pip install . - - artifacts: - expire_in: 1 week - paths: - - venv/ - -# build:windows_cpp: -# stage: build -# needs: [] -# tags: -# - windows - -# image: buildtools -# before_script: -# # Install Chocolatey -# - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) -# # Install dependencies -# - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y -# - choco install git -Y -# - choco install python -Y -# - choco install cuda -Y -# # Update PATH -# - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") -# script: -# # Download dependencies -# # aidge_core -# - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_cpp" -o build_artifacts.zip' -# - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force -# - Remove-Item .\build_cpp\ -Recurse -# # aidge_backend_cpu -# - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/main/download?job=build:windows_cpp" -o build_artifacts.zip' -# - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force -# - Remove-Item .\build_cpp\ -Recurse - -# - $env:CMAKE_PREFIX_PATH = '../install_cpp' -# - mkdir -p build_cpp -# - cd build_cpp -# - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug .. -# - cmake --build . -j2 -# - cmake --install . 
--config Debug - -# artifacts: -# expire_in: 1 week -# paths: -# - build_cpp/ -# - install_cpp/ - -# build:windows_python: -# stage: build -# needs: [] -# tags: -# - windows - -# image: buildtools -# before_script: -# # Install Chocolatey -# - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) -# # Install dependencies -# - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y -# - choco install git -Y -# - choco install python -Y -# - choco install cuda -Y -# # Update PATH -# - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") -# script: -# # Download dependencies -# # aidge_core (Python) -# - 'curl "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:windows_python" -o build_artifacts.zip' -# - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force -# # aidge_backend_cpu (Python) -# - 'curl "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/main/download?job=build:windows_python" -o build_artifacts.zip' -# - Expand-Archive -Path .\build_artifacts.zip -DestinationPath . -Force - -# - python -m pip install virtualenv -# - virtualenv venv -# - venv\Scripts\Activate.ps1 -# - python -m pip install -r requirements.txt -# - python -m pip install . -# artifacts: -# expire_in: 1 week -# paths: -# - venv/ diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1 b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..c2715ea5550432838d3cc8692e97204b278d2c85 --- /dev/null +++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1 @@ -0,0 +1,23 @@ +$ErrorActionPreference = "Stop" + +# Retrieve and clean the dependencies string from the environment variable +$AIDGE_DEPENDENCIES = $env:AIDGE_DEPENDENCIES -split ' ' +Write-Host "Aidge dependencies : $AIDGE_DEPENDENCIES" +if ( $($AIDGE_DEPENDENCIES.Length) -eq 0) { + Write-Host "- No dependencies provided for current repsitory" + New-Item -ItemType Directory -Force -Path ".\build" | Out-Null + Remove-Item -Path ".\build\*" -Recurse -Force + } else { + Write-Host "Retrieving given dependencies to build current package : $AIDGE_DEPENDENCIES" + foreach ($dep in $($AIDGE_DEPENDENCIES -split " ")) { + Write-Host "Retrieving : $dep" + $curr_loc=$(Get-Location) + Set-Location ../$dep + Get-Location + Get-ChildItem . + New-Item -Path ".\build" -ItemType Directory -Force | Out-Null + Get-ChildItem -Path ".\build" -File | Remove-Item -Force + python -m pip install . -v + Set-Location $curr_loc + } +} diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh new file mode 100755 index 0000000000000000000000000000000000000000..4f74488ae41714a4ce03ba7514bf93842768c5ae --- /dev/null +++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e +if [[ "$1" == "" ]]; then + echo "build aidge deps in cibuildwheel container before building wheel." + echo "search path defines where the dependencies will be searched." + echo "Hint : In wheel containers, files are mounted on /host by default." 
+ echo "\nusage : ./cibuildwheel_build_deps_before_build_wheel.sh $search_path" +fi +set -x +if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_ core + mkdir -p build # creating build if its not already there to hold the build of cpp files + rm -rf build/* # build from scratch +else + for repo in $AIDGE_DEPENDENCIES ; do # case for other projects + search_path=$1 + REPO_PATH=$(find $search_path ! -writable -prune -o -type d \ + -name "$repo" \ + -not -path "*/install/*" \ + -not -path "*/.git/*" \ + -not -path "*/miniconda/*" \ + -not -path "*/conda/*" \ + -not -path "*/.local/*" \ + -not -path "*/lib/*" \ + -not -path "*/$repo/$repo/*" \ + -not -path "*/proc/*" \ + -print -quit) + if [[ -z "$REPO_PATH" ]]; then + echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING." + exit -1 + fi + + cd $REPO_PATH + mkdir -p build # creating build if its not already there to hold the build of cpp files + rm -rf build/* # build from scratch + pip install . -v + cd - + done +fi +set +x +set +e diff --git a/.gitlab/ci/coverage.gitlab-ci.yml b/.gitlab/ci/coverage.gitlab-ci.yml deleted file mode 100644 index 33547fc3f52771c456fba3d34a6e8d96eebafd8a..0000000000000000000000000000000000000000 --- a/.gitlab/ci/coverage.gitlab-ci.yml +++ /dev/null @@ -1,41 +0,0 @@ -coverage:ubuntu_cpp: - stage: coverage - needs: ["build:ubuntu_cpp"] - tags: - - docker - script: - - cd build_cpp - - ctest --output-on-failure - # HTML report for visualization - - gcovr --html-details --exclude-unreachable-branches -o coverage.html --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/' - # Coberta XML report for Gitlab integration - - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/' - coverage: /^\s*lines:\s*\d+.\d+\%/ - artifacts: - name: ${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-${CI_COMMIT_SHA} - expire_in: 2 days - reports: - coverage_report: - coverage_format: cobertura - path: build_cpp/coverage.xml - -coverage:ubuntu_python: - stage: coverage - needs: ["build:ubuntu_python"] - tags: - - docker - script: - - source venv/bin/activate - - python3 -m pip install numpy coverage - - cd ${CI_PROJECT_NAME} - # Retrieve the installation path of the module, since it is installed with pip. - - export MODULE_LOCATION=`python -c "import ${CI_PROJECT_NAME} as _; print(_.__path__[0])"` - - python3 -m coverage run --source=$MODULE_LOCATION -m unittest discover -s unit_tests/ -v -b - - python3 -m coverage report - - python3 -m coverage xml - coverage: '/(?i)total.*? 
(100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' - artifacts: - reports: - coverage_report: - coverage_format: cobertura - path: ${CI_PROJECT_NAME}/coverage.xml diff --git a/.gitlab/ci/test.gitlab-ci.yml b/.gitlab/ci/test.gitlab-ci.yml deleted file mode 100644 index 92b932f86193d1525b9bba8cad0b92271f3c966f..0000000000000000000000000000000000000000 --- a/.gitlab/ci/test.gitlab-ci.yml +++ /dev/null @@ -1,48 +0,0 @@ -test:ubuntu_cpp: - stage: test - needs: ["build:ubuntu_cpp"] - tags: - - docker - script: - - cd build_cpp - - ctest --output-junit ctest-results.xml --output-on-failure - artifacts: - reports: - junit: build_cpp/ctest-results.xml - -test:ubuntu_python: - stage: test - needs: ["build:ubuntu_python"] - tags: - - docker - script: - - source venv/bin/activate - - cd ${CI_PROJECT_NAME} - - python3 -m pip install numpy unittest-xml-reporting - - python3 -m pip list - # Run on discovery all tests located in core/unit_tests/python - - python3 -m xmlrunner discover -s unit_tests/ -v -b --output-file xmlrunner-results.xml - artifacts: - reports: - junit: ${CI_PROJECT_NAME}/xmlrunner-results.xml - -# test:windows_cpp: -# stage: test -# needs: ["build:windows_cpp"] -# tags: -# - windows -# image: buildtools -# before_script: -# # Install Chocolatey -# - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) -# # Install dependencies -# - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y -# - choco install python -Y -# # Update PATH -# - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") -# script: -# - cd build_cpp -# - ctest --output-junit ctest-results.xml --output-on-failure -# artifacts: -# reports: -# junit: build_cpp/ctest-results.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index 01ebb6f258b173aee6df867c5c5c991ec936df57..bea1b398ced94f3ec4da97b5db6237fd9882d87d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,15 @@ # CMake >= 3.18 is required for good support of FindCUDAToolkit cmake_minimum_required(VERSION 3.18) +set(CXX_STANDARD 14) + +file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version) -file(READ "${CMAKE_SOURCE_DIR}/version.txt" version) -add_definitions(-DPROJECT_VERSION="${version}") -file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project) +project(aidge_backend_cuda + VERSION ${version} + DESCRIPTION "CUDA implementations of the operators of aidge framework." 
+ LANGUAGES CXX) -message(STATUS "Project name: ${project}") +message(STATUS "Project name: ${CMAKE_PROJECT_NAME}") message(STATUS "Project version: ${version}") execute_process( @@ -13,23 +17,18 @@ execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET ) message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}") - # Define a preprocessor macro with the Git commit version add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}") - -# Note : project name is {project} and python module name is also {project} -set(module_name _${project}) # target name - - -project(${project}) -set(CXX_STANDARD 14) +# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME} +set(module_name _${CMAKE_PROJECT_NAME}) # target name ############################################## # Define options -option(PYBIND "python binding" ON) +option(PYBIND "python binding" OFF) option(WERROR "Warning as error" OFF) option(TEST "Enable tests" ON) option(COVERAGE "Enable coverage" OFF) @@ -38,34 +37,76 @@ option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memor ############################################## # Import utils CMakeLists set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") -include(PybindModuleCreation) if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE) Include(CodeCoverage) endif() -enable_language(CUDA) +############################################## +# Find system dependencies +############################################## +# FIND AIDGE Dependencies +if(NOT $ENV{AIDGE_INSTALL} STREQUAL "") + set(CMAKE_INSTALL_PREFIX $ENV{AIDGE_INSTALL}) + list(APPEND CMAKE_PREFIX_PATH $ENV{AIDGE_INSTALL}) + message(WARNING "Env var AIDGE_INSTALL detected : $ENV{AIDGE_INSTALL}. Set CMAKE_INSTALL_PREFIX to AIDGE_INSTALL & added to CMAKE_PREFIX_PATH" + "\n\tCMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}" + "\n\tCMAKE_PREFIX_PATH = ${CMAKE_PREFIX_PATH}") +endif() +find_package(aidge_core REQUIRED) +if(TEST) + find_package(aidge_backend_cpu REQUIRED) +endif() + +########## +# CUDA +if(NOT $ENV{AIDGE_INSTALL} STREQUAL "") + message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job." 
+ "\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH Variables") + list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64) + list(APPEND ENV{PATH} /usr/local/cuda/bin) + set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc) +endif() +find_package(CUDAToolkit REQUIRED) +if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() +if(NOT DEFINED CMAKE_CUDA_ARCHITECURE) + set(CMAKE_CUDA_ARCHITECTURE native) +endif() message(STATUS "Cuda compiler version = ${CMAKE_CUDA_COMPILER_VERSION}") # Define a preprocessor macro with the Cuda compiler version add_definitions(-DCUDA_COMPILER_VERSION="${CMAKE_CUDA_COMPILER_VERSION}") +message(STATUS "CUDA STANDARD : ${CMAKE_CUDA_STANDARD}") +message(STATUS "CUDA ARCHITECTURE : ${CMAKE_CUDA_ARCHITECTURES}") -############################################## -# Find system dependencies -find_package(CUDAToolkit REQUIRED) +enable_language(CUDA) -find_package(aidge_core REQUIRED) -if(TEST) - find_package(aidge_backend_cpu REQUIRED) -endif() ############################################## # Create target and set properties - file(GLOB_RECURSE src_files "src/*.cpp" "src/*.cu") file(GLOB_RECURSE inc_files "include/*.hpp") add_library(${module_name} ${src_files} ${inc_files}) + +# PYTHON BINDING +if (PYBIND) + # Handles Python + pybind11 headers dependencies + include(PybindModuleCreation) + # creates a target of the same name as CMAKE_PROJECT_NAME + generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project. + + target_link_libraries(${module_name} + PUBLIC + pybind11::pybind11 + PRIVATE + Python::Module + ) +endif() + target_link_libraries(${module_name} PUBLIC _aidge_core # _ is added because we link the target not the project @@ -75,7 +116,7 @@ target_link_libraries(${module_name} ) if( ${ENABLE_ASAN} ) - message("Building ${module_name} with ASAN.") + message("Building ${module_name} with ASAN.") set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer) target_link_libraries(${module_name} PUBLIC @@ -103,27 +144,9 @@ target_include_directories(${module_name} ${CMAKE_CURRENT_SOURCE_DIR}/src ) -if(NOT DEFINED CMAKE_CUDA_STANDARD) - set(CMAKE_CUDA_STANDARD 14) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) -endif() - set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON) set_target_properties(${module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -# PYTHON BINDING -if (PYBIND) - generate_python_binding(${project} ${module_name}) - - # Handles Python + pybind11 headers dependencies - target_link_libraries(${module_name} - PUBLIC - pybind11::pybind11 - PRIVATE - Python::Python - ) -endif() - target_compile_features(${module_name} PRIVATE cxx_std_14) target_compile_options(${module_name} PRIVATE @@ -142,11 +165,10 @@ endif() ############################################## # Installation instructions - include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${project}) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME}) -install(TARGETS ${module_name} EXPORT ${project}-targets +install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -157,8 +179,8 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) #Export the targets to a script -install(EXPORT ${project}-targets - FILE "${project}-targets.cmake" +install(EXPORT 
${CMAKE_PROJECT_NAME}-targets + FILE "${CMAKE_PROJECT_NAME}-targets.cmake" DESTINATION ${INSTALL_CONFIGDIR} # COMPONENT ${module_name} ) @@ -167,32 +189,34 @@ install(EXPORT ${project}-targets include(CMakePackageConfigHelpers) write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake" VERSION ${version} COMPATIBILITY AnyNewerVersion ) -configure_package_config_file("${project}-config.cmake.in" - "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake" +configure_package_config_file("${CMAKE_PROJECT_NAME}-config.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake" INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) #Install the config, configversion and custom find modules install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake" DESTINATION ${INSTALL_CONFIGDIR} ) ############################################## ## Exporting from the build tree -export(EXPORT ${project}-targets - FILE "${CMAKE_CURRENT_BINARY_DIR}/${project}-targets.cmake") - +export(EXPORT ${CMAKE_PROJECT_NAME}-targets + FILE "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-targets.cmake") ############################################## ## Add test if(TEST) + if(PYBIND) + message(FATAL_ERROR "PYBIND and TEST are both enabled. But cannot compile with catch_2.\nChoose between pybind and Catch2 for compilation.") + endif() enable_testing() add_subdirectory(unit_tests) endif() diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..7cbc972fca5c7af1ffb6df6c08480ebad982884c --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +include README.md LICENCE +recursive-include aidge_backend_cuda *.py +recursive-exclude aidge_backend_cuda/unit_tests *.py + +recursive-include include *.hpp +recursive-include src *.cpp +recursive-include python_binding *.cpp +include CMakeLists.txt diff --git a/README.md b/README.md index 30b34248e75429d321b3b1bfa1861496ed50878f..09e16ed4f48371ff11093aa69a17c576c2b8d173 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,28 @@ # Aidge CUDA library You can find in this folder the library that implements the CUDA operators. +[TOC] -## Pip installation +## Installation -You will need to install first the aidge_core library before installing aidge_backend_cuda. -Also, make sure that the install path was set before installing aidge_core library. -Then run in your python environnement : +### Dependencies +- `GCC` +- `Make`/`Ninja` +- `CMake` +- `Python` (optional, if you have no intend to use this library in python with pybind) + +#### Aidge dependencies + - `aidge_core` + - `aidge_backend_cpu` + +### Pip installation ``` bash pip install . -v ``` +> **TIPS:** Use environment variables to change compilation options: +> - `AIDGE_INSTALL`: to set the installation folder. Defaults to /usr/local/lib. :warning: This path must be identical to aidge_core install path. 
+> - `AIDGE_PYTHON_BUILD_TYPE`: to set the compilation mode to **Debug** or **Release** +> - `AIDGE_BUILD_GEN`: to set the build backend with ## Standard C++ Compilation diff --git a/aidge_backend_cuda/__init__.py b/aidge_backend_cuda/__init__.py index a2c06b0ec8a29b7100b0cf7c461092c491197331..c59afd66efc6197d93268e9e205f35048a26d60e 100644 --- a/aidge_backend_cuda/__init__.py +++ b/aidge_backend_cuda/__init__.py @@ -1 +1,2 @@ -from aidge_backend_cuda.aidge_backend_cuda import * # import so generated by PyBind +from aidge_backend_cuda.aidge_backend_cuda import * # import so generated by PyBind +from ._version import * diff --git a/aidge_backend_cuda/unit_tests/test_tensor.py b/aidge_backend_cuda/unit_tests/test_tensor.py index 6c4717442803badd3d0ac2ea96fb3be44baeaaff..035e0f96d9dee1e948c2aa621181c3f215a457c3 100644 --- a/aidge_backend_cuda/unit_tests/test_tensor.py +++ b/aidge_backend_cuda/unit_tests/test_tensor.py @@ -6,15 +6,17 @@ import numpy as np class test_tensor(unittest.TestCase): - """Test tensor binding - """ + """Test tensor binding""" + def setUp(self): pass + def tearDown(self): pass def test_getavailable_backends(self): self.assertTrue("cuda" in aidge_core.Tensor.get_available_backends()) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake index 8030c1a8639e4b7ae0c5fb865e928a4260c6ae7d..8f386bef59ed86dfa366eca5d4fccae24b28d24e 100644 --- a/cmake/PybindModuleCreation.cmake +++ b/cmake/PybindModuleCreation.cmake @@ -1,21 +1,25 @@ -function(generate_python_binding name target_to_bind) +function(generate_python_binding pybind_module_name target_to_bind) add_definitions(-DPYBIND) Include(FetchContent) + set(PYBIND_VERSION v2.10.4) + set(PYBIND11_FINDPYTHON ON) + message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git") + FetchContent_Declare( - PyBind11 - GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.10.4 # or a later release + PyBind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG ${PYBIND_VERSION} # or a later release ) # Use the New FindPython mode, recommanded. 
Requires CMake 3.15+ - find_package(Python COMPONENTS Interpreter Development) + find_package(Python COMPONENTS Interpreter Development.Module) FetchContent_MakeAvailable(PyBind11) - message(STATUS "Creating binding for module ${name}") + message(STATUS "Creating binding for module ${pybind_module_name}") file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp") - pybind11_add_module(${name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install - target_include_directories(${name} PUBLIC "python_binding") - target_link_libraries(${name} PUBLIC ${target_to_bind}) + pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install + target_include_directories(${pybind_module_name} PUBLIC "python_binding") + target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind}) endfunction() diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index 580dce246b4c43e9a82fc977103145f79ae0976e..d5e9d1654f0a4fe894ed0e965a25b32c9e5caa06 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -14,17 +14,32 @@ #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/AddImpl.hpp" +#include "aidge/backend/cuda/operator/AndImpl.hpp" +#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp" #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" #include "aidge/backend/cuda/operator/BatchNormImpl.hpp" #include "aidge/backend/cuda/operator/ConvImpl.hpp" +#include "aidge/backend/cuda/operator/DivImpl.hpp" #include "aidge/backend/cuda/operator/FCImpl.hpp" #include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cuda/operator/LnImpl.hpp" #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cuda/operator/MulImpl.hpp" #include "aidge/backend/cuda/operator/PadImpl.hpp" +#include "aidge/backend/cuda/operator/PowImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/operator/ReLUImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ReshapeImpl.hpp" #include "aidge/backend/cuda/operator/SigmoidImpl.hpp" #include "aidge/backend/cuda/operator/SubImpl.hpp" #include "aidge/backend/cuda/operator/TanhImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" + + #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 96045781647f93f0627ca0853a0cdaa66a08af83..541afeecc751332d41ff082b790282abcad5a1b0 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -221,7 +221,39 @@ public: &strides[0])); } } + else { + // Compare if the shape of the tensor has changed + cudnnDataType_t currentDataType; + int currentNbDims; + // Since we don't know the nb dims of the current tensor, we init with CUDNN_DIM_MAX then remove the trailing zeros + std::vector<int> currentDims(CUDNN_DIM_MAX); + std::vector<int> currentStrides(CUDNN_DIM_MAX); + + CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, ¤tDataType, ¤tNbDims, currentDims.data(), currentStrides.data())); + // Remove the trailing zeros + 
currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(), + currentDims.end()); + + std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); + if (dims.size() < 4) { + dims.resize(4, 1); + } + + // Update descriptor if shape has changed + if (dims!=currentDims) { + std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); + if (strides.size() < 4) { + strides.resize(4, 1); + } + + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, + CudaContext::data_type<T>::value, + dims.size(), + &dims[0], + &strides[0])); + } + } return mCudnnTensor; } @@ -255,7 +287,7 @@ static Registrar<Tensor> registrarTensorImpl_cuda_Float32( static Registrar<Tensor> registrarTensorImpl_cuda_Float16( {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create); static Registrar<Tensor> registrarTensorImpl_cuda_Int32( - {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int>::create); + {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create); } // namespace } // namespace Aidge diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp index cd1819753cd00a325443d9c9c992f3d2347bb377..429d6f1b04489d9e38ce96d584a1ce9528dd0b2d 100644 --- a/include/aidge/backend/cuda/operator/AddImpl.hpp +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -27,30 +27,33 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class AddImpl_cuda : public OperatorImpl { -private: - - public: - AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {} + AddImpl_cuda(const Add_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) { + static std::unique_ptr<AddImpl_cuda> create(const Add_Op& op) { return std::make_unique<AddImpl_cuda>(op); } -public: - void forward(); - void backward(); - // ~AddImpl_cuda(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + private: template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); }; -namespace { -// add cuda backend to Add_Op implementation registry -static Registrar<Add_Op> registrarAddImpl_cuda("cuda", Aidge::AddImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(Add_Op, "cuda", Aidge::AddImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AndImpl.hpp b/include/aidge/backend/cuda/operator/AndImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4105ec87db2c58e218c629a1c94f31efd37c80ee --- /dev/null +++ b/include/aidge/backend/cuda/operator/AndImpl.hpp @@ -0,0 +1,57 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
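Note on the pattern applied throughout these operator headers: each backend implementation now derives from `OperatorImpl`, advertises the data types it supports through a `getAvailableImplSpecs()` override, and registers its factory with the `REGISTRAR` macro instead of a file-local `static Registrar<>` object. A condensed sketch of that structure, written for a hypothetical `Foo_Op` operator (the `Foo` names are invented purely for illustration and do not exist in the patch), looks like this:

```cpp
// Illustrative sketch only: hypothetical Foo_Op CUDA backend following the
// structure used by AddImpl_cuda / AndImpl_cuda in this patch.
#include <memory>
#include <set>

#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Foo.hpp"        // hypothetical operator header
#include "aidge/utils/Registrar.hpp"

namespace Aidge {
class FooImpl_cuda : public OperatorImpl {
public:
    FooImpl_cuda(const Foo_Op& op) : OperatorImpl(op, "cuda") {}

    // Factory consulted by the registry when a graph node selects the "cuda" backend.
    static std::unique_ptr<FooImpl_cuda> create(const Foo_Op& op) {
        return std::make_unique<FooImpl_cuda>(op);
    }

    // Advertise which specs this backend can handle.
    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},
            {DataType::Float16},
        };
    }

    void forward() override;  // defined alongside the kernels, per data type

private:
    template <class T> void forward_(const Tensor& input);
};

// Implementation entry point registration to the operator.
REGISTRAR(Foo_Op, "cuda", Aidge::FooImpl_cuda::create);
} // namespace Aidge
```

Compared with the previous anonymous-namespace `static Registrar<Add_Op> registrarAddImpl_cuda(...)` objects, registration becomes a single macro call and the supported data types are discoverable through `getAvailableImplSpecs()`; the concrete headers in this patch (AddImpl_cuda, AndImpl_cuda, DivImpl_cuda, ...) follow this exact layout.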
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/And.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class AndImpl_cuda : public OperatorImpl { +public: + AndImpl_cuda(const And_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<AndImpl_cuda> create(const And_Op& op) { + return std::make_unique<AndImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); +}; + +// Implementation entry point registration to Operator +REGISTRAR(And_Op, "cuda", Aidge::AndImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bae79a03d03cd5fb7d5fdc4fbebf1dd7562370ae --- /dev/null +++ b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp @@ -0,0 +1,37 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +template <class T> +void AndForward(const T* input1, const T* input2, T* output, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, + const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); +} +#endif /* AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a89aebf96914f258f6be616b940ec195ec9ae2a9 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp @@ -0,0 +1,60 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
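The new `*Impl_CUDA_kernels.hpp` headers, such as `AndImpl_CUDA_kernels.hpp` above, only declare templated launch functions; their definitions live in the `src/*.cu` files, which are not part of this excerpt. As a rough, simplified sketch of such a launcher (assuming contiguous, non-broadcast inputs and a placeholder element-wise operation, since the real kernel bodies are not shown here):

```cpp
// Hypothetical, simplified element-wise launcher as it would appear in a src/*.cu file.
// It ignores the dims/strides arguments that the real declarations take for
// broadcasting, and uses a placeholder operation in place of And/Div/Mul/...
#include <cuda_runtime_api.h>

namespace Aidge {

template <class T>
__global__ void elementWiseKernel(const T* input1, const T* input2, T* output, int size) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // Placeholder: the actual kernels implement the operator's semantics here.
        output[idx] = input1[idx] * input2[idx];
    }
}

template <class T>
void elementWiseForward(const T* input1, const T* input2, T* output, int outSize) {
    constexpr int blockSize = 256;
    const int numBlocks = (outSize + blockSize - 1) / blockSize;
    elementWiseKernel<T><<<numBlocks, blockSize>>>(input1, input2, output, outSize);
}

// Explicit instantiations for (a subset of) the types advertised in getAvailableImplSpecs().
template void elementWiseForward<float>(const float*, const float*, float*, int);
template void elementWiseForward<double>(const double*, const double*, double*, int);

} // namespace Aidge
```

The header/definition split keeps the cuDNN- and CUDA-specific compilation confined to the `.cu` translation units while the `*Impl_cuda` classes above only dispatch to these launchers from their `forward_<T>()` templates.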
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ArgMax.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class ArgMaxImpl_cuda : public OperatorImpl { +public: + ArgMaxImpl_cuda(const ArgMax_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op& op) { + return std::make_unique<ArgMaxImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + +private: + // CuDNN specific variables + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + + template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx); +}; + +// Implementation entry point registration to Operator +REGISTRAR(ArgMax_Op, "cuda", Aidge::ArgMaxImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8c07bf597f6422a26cedd4176fdb1ef29bcabcef --- /dev/null +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp @@ -0,0 +1,31 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge +{ + template <class T> + void ArgMax_cuda_forward_kernel(const T* input, T* output, + const std::vector<int>& inputDims, const std::vector<int>& inputStrides, + int axis, int total_elems, std::size_t selectLastIdx); +} +#endif /* AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp index 540ec574f9b5fbcea8b8f28e390cbe05f1e0fa8e..7f8fb4075affd3e5f17533ea67b051dbb6395f04 100644 --- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -27,35 +27,41 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend template <DimIdx_t DIM> class AvgPoolingImpl_cuda : public OperatorImpl { -private: - // CuDNN specific variables - cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; - cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - public: - AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + AvgPoolingImpl_cuda(const AvgPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<2> &op) { + static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<DIM>& op) { return std::make_unique<AvgPoolingImpl_cuda>(op); } -public: - void forward(); - void backward(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; ~AvgPoolingImpl_cuda(); private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to AvgPooling_Op<2> implementation registry -static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl_cuda("cuda", Aidge::AvgPoolingImpl_cuda<2>::create); -} // namespace +// Implementation entry point registration to Operator +using AvgPooling2D_Op = AvgPooling_Op<2>; +REGISTRAR(AvgPooling2D_Op, "cuda", Aidge::AvgPoolingImpl_cuda<2>::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp index 3451d07f289371202570434f96546344c0c4fb26..5ba8656ef8a25ffa53584641a938f637ecff9b94 100644 --- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp @@ -27,35 +27,41 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend template <DimIdx_t DIM> class 
BatchNormImpl_cuda : public OperatorImpl { -private: - // CuDNN specific variables - cudnnTensorDescriptor_t mBNDesc = nullptr; - cudnnBatchNormMode_t mMode; - double mEpsilon; - public: - BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + BatchNormImpl_cuda(const BatchNorm_Op<DIM>& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM> &op) { + static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM>& op) { return std::make_unique<BatchNormImpl_cuda>(op); } -public: - void forward(); - void backward(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; ~BatchNormImpl_cuda(); private: + // CuDNN specific variables + cudnnTensorDescriptor_t mBNDesc = nullptr; + cudnnBatchNormMode_t mMode; + double mEpsilon; + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4); template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); }; -namespace { -// add cuda backend to BatchNorm_Op<2> implementation registry -static Registrar<BatchNorm_Op<2>> registrarBatchNormImpl_cuda("cuda", Aidge::BatchNormImpl_cuda<2>::create); -} // namespace +// Implementation entry point registration to Operator +using BatchNorm2D_Op = BatchNorm_Op<2>; +REGISTRAR(BatchNorm2D_Op, "cuda", Aidge::BatchNormImpl_cuda<2>::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 0722048f7cf021104a9694a621b1c0dad00ce423..ce94ec6695735c93d5c8d0acfdc6153e91e7147d 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -29,13 +29,35 @@ namespace Aidge { +// Operator implementation entry point for the backend template <DimIdx_t DIM> class ConvImpl_cuda : public OperatorImpl { +public: + ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} + + static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM>& op) { + return std::make_unique<ConvImpl_cuda<DIM>>(op); + } + + static std::unique_ptr<ConvImpl_cuda<DIM>> createDW(const ConvDepthWise_Op<DIM> &op) { + return std::make_unique<ConvImpl_cuda<DIM>>(op, true); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; + ~ConvImpl_cuda(); + private: // CuDNN specific variables cudnnConvolutionDescriptor_t mConvDesc = nullptr; cudnnFilterDescriptor_t mFilterDesc = nullptr; - cudnnConvolutionFwdAlgo_t mFwdAlgo; + cudnnConvolutionFwdAlgo_t mFwdAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo; cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo; size_t mWorkspaceSize = 0; @@ -46,31 +68,15 @@ private: std::shared_ptr<Tensor> mInput2Fallback; bool mDepthWise = false; -public: - ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} - - static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) { - return std::make_unique<ConvImpl_cuda>(op); - } - - static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> 
&op) { - return std::make_unique<ConvImpl_cuda>(op, true); - } - -public: - void forward(); - void backward(); - ~ConvImpl_cuda(); - -private: template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); }; -namespace { -static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create); -static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW); -} // namespace +// Implementation entry point registration to Operator +using Conv2D_Op = Conv_Op<2>; +using ConvDepthWise2D_Op = ConvDepthWise_Op<2>; +REGISTRAR(Conv2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::create); +REGISTRAR(ConvDepthWise2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::createDW); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl.hpp b/include/aidge/backend/cuda/operator/DivImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4b15445cb791aa1cf2520018d1015e19aaf10ce3 --- /dev/null +++ b/include/aidge/backend/cuda/operator/DivImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Div.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class DivImpl_cuda : public OperatorImpl { +public: + DivImpl_cuda(const Div_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<DivImpl_cuda> create(const Div_Op& op) { + return std::make_unique<DivImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + template <class T> void backward_(const Tensor& outGrad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Div_Op, "cuda", Aidge::DivImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..512bec77bb63570ffeb8f1681e4e25cd323535fa --- /dev/null +++ b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp @@ -0,0 +1,39 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials 
are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void divForward(const T* input1, T* output, const T* intput2, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); + +} +#endif /* AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp index 46f7849d1f17aab5496bdbde013ef078ad1f5a7c..f2dd0c90c0096a1b57fb6860e5991d0c1e824be9 100644 --- a/include/aidge/backend/cuda/operator/FCImpl.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -27,34 +27,37 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class FCImpl_cuda : public OperatorImpl { -private: - std::shared_ptr<Tensor> mInput0Fallback; - std::shared_ptr<Tensor> mInput1Fallback; - std::shared_ptr<Tensor> mInput2Fallback; - - public: - FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {} + FCImpl_cuda(const FC_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) { + static std::unique_ptr<FCImpl_cuda> create(const FC_Op& op) { return std::make_unique<FCImpl_cuda>(op); } -public: - void forward(); - void backward(); - // ~FCImpl_cuda(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; private: + std::shared_ptr<Tensor> mInput0Fallback; + std::shared_ptr<Tensor> mInput1Fallback; + std::shared_ptr<Tensor> mInput2Fallback; + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); }; -namespace { -// add cuda backend to FC_Op implementation registry -static Registrar<FC_Op> registrarFCImpl_cuda("cuda", Aidge::FCImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(FC_Op, "cuda", Aidge::FCImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp index 8d1af8f7c5954c2eae9179926aec433eee34414f..a956960df0a4dccb4ef9eb0634e5f61b9ddede0a 100644 --- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp @@ -9,8 +9,8 @@ * ********************************************************************************/ -#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ -#define AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ 
+#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ #include <stdexcept> #include <cfloat> @@ -42,4 +42,4 @@ cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, const T *beta, T *y, int incy); } -#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file +#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp index 6e0fad5c01efb6474f527dee0bfbfdc594788bc6..3f0386dcfa68d4b55bebeb524dfedfd5edeb0fe9 100644 --- a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp @@ -27,34 +27,37 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class GlobalAveragePoolingImpl_cuda : public OperatorImpl { -private: - // CuDNN specific variables - cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr; - cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - public: - GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) : OperatorImpl(op, "cuda") {} + GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op &op) { + static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op& op) { return std::make_unique<GlobalAveragePoolingImpl_cuda>(op); } -public: - void forward(); - void backward(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; ~GlobalAveragePoolingImpl_cuda(); private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to GlobalAveragePooling_Op implementation registry -static Registrar<GlobalAveragePooling_Op> registrarGlobalAveragePoolingImpl_cuda("cuda", Aidge::GlobalAveragePoolingImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(GlobalAveragePooling_Op, "cuda", Aidge::GlobalAveragePoolingImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..742401de7903f19ab4d8f51a153b0e864f21dd47 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp @@ -0,0 +1,65 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 10.09.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ILayerNorm.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class ILayerNormImpl_cuda : public OperatorImpl { +public: + ILayerNormImpl_cuda(const ILayerNorm_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ILayerNormImpl_cuda> create(const ILayerNorm_Op &op) { + return std::make_unique<ILayerNormImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInput0Fallback; + std::shared_ptr<Tensor> mInput1Fallback; + std::shared_ptr<Tensor> mInput2Fallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> void backward_(const Tensor& output_grad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(ILayerNorm_Op, "cuda", Aidge::ILayerNormImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aa54029ea29bc46809f227038a1a23d91bc161ee --- /dev/null +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp @@ -0,0 +1,92 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 10.09.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +/** + * @brief Compute the forward for ILayerNorm + * @param input: Input tensor + * @param SF: Scaling factor of input tensor + * @param dims: Dimensions of input tensor + * @param quantized_tensor: Quantized output tensor + * @param square_tensor: Tensor use for computation + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param new_SF: Scaling factor of output that can be use to dequantify +*/ +template <class T> +__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor,long long int* square_tensor, T* weight, T* biase, double new_SF); + +/** + * @brief Wrapper function to execute ILayerNormforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param weight_raw: weight of ILayerNorm layer + * @param bias_raw: bias of ILayerNorm layer + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor +*/ +template <class T> +void ILayerNormforward(const T* input, T* output, double SF, const T* weight_raw, const T* bias_raw, size_t size, std::vector<long unsigned int> dims_input); + +/** + * @brief Compute the backward for ILayerNorm + * @param output_grad: Gradient of output tensor + * @param input_tensor: Input tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor +*/ +template <class T> +__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size); + +/** + * @brief Wrapper function to execute ILayerNormbackward_ + * @param input_tensor: Input tensor + * @param output_grad: Gradient of output tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor +*/ +template <class T> +void ILayerNormbackward(const T* input_tensor, const T* output_grad, const T* output_tensor,const T* mean,const T* var, const T* weight, const T* bias, T* input_grad, T* weight_grad, T* bias_grad, 
size_t size); + +} + +#endif /* AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/LnImpl.hpp b/include/aidge/backend/cuda/operator/LnImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1617754fbf5dd52e099a9787a25a827851933af9 --- /dev/null +++ b/include/aidge/backend/cuda/operator/LnImpl.hpp @@ -0,0 +1,62 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Ln.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class LnImpl_cuda : public OperatorImpl { +public: + LnImpl_cuda(const Ln_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<LnImpl_cuda> create(const Ln_Op& op) { + return std::make_unique<LnImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Ln_Op, "cuda", Aidge::LnImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9652d88116ca2cac92abbc517f8bc650655f43cc --- /dev/null +++ b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp @@ -0,0 +1,36 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
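
The ILayerNormforward / ILayerNormbackward wrappers above take raw pointers plus the I-ViT-style scaling factor SF. A minimal host-side sketch of a forward call follows; it assumes the wrapper expects device pointers and that weight/bias are per-feature over the last dimension, neither of which is spelled out in the header, so treat it as an illustration rather than the backend's actual call path.

    #include <cstddef>
    #include <vector>
    #include <cuda_runtime_api.h>
    // #include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp"

    void ilayernormForwardExample(const float* hIn, const float* hW, const float* hB,
                                  float* hOut, std::vector<long unsigned int> dims,
                                  double SF) {
        std::size_t size = 1;
        for (auto d : dims) size *= d;
        const std::size_t feat = dims.back();   // assumed per-feature weight/bias layout

        float *dIn = nullptr, *dW = nullptr, *dB = nullptr, *dOut = nullptr;
        cudaMalloc(reinterpret_cast<void**>(&dIn),  size * sizeof(float));
        cudaMalloc(reinterpret_cast<void**>(&dW),   feat * sizeof(float));
        cudaMalloc(reinterpret_cast<void**>(&dB),   feat * sizeof(float));
        cudaMalloc(reinterpret_cast<void**>(&dOut), size * sizeof(float));
        cudaMemcpy(dIn, hIn, size * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dW,  hW,  feat * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dB,  hB,  feat * sizeof(float), cudaMemcpyHostToDevice);

        // Output is the non-quantized (dequantized) tensor; SF is the input scaling factor.
        Aidge::ILayerNormforward<float>(dIn, dOut, SF, dW, dB, size, dims);

        cudaMemcpy(hOut, dOut, size * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(dIn); cudaFree(dW); cudaFree(dB); cudaFree(dOut);
    }
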
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void lnForward(const T* input, T* output, int size); + +} +#endif /* AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp index db7f1e376013db52aeb1b27f8cc3ff192c7f0629..a203e761beaeccec96b36bbd5a424a193cdb6387 100644 --- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -27,35 +27,39 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend template <DimIdx_t DIM> class MaxPoolingImpl_cuda : public OperatorImpl { -private: - // CuDNN specific variables - cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; - cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - public: - MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + MaxPoolingImpl_cuda(const MaxPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<2> &op) { + static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<DIM>& op) { return std::make_unique<MaxPoolingImpl_cuda>(op); } -public: - void forward(); - void backward(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; ~MaxPoolingImpl_cuda(); private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to MaxPooling_Op<2> implementation registry -static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl_cuda("cuda", Aidge::MaxPoolingImpl_cuda<2>::create); -} // namespace +// Implementation entry point registration to Operator +using MaxPooling2D_Op = MaxPooling_Op<2>; +REGISTRAR(MaxPooling2D_Op, "cuda", Aidge::MaxPoolingImpl_cuda<2>::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..37d3d5a0df7b63dc63ad13737d8a8b463bf315c8 --- /dev/null +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
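
The MaxPooling2D_Op alias introduced above (and the analogous Pad2D_Op alias just below) is not cosmetic: REGISTRAR is defined in aidge_core (not shown in this diff), and macros of this kind typically paste the operator name into a unique identifier, which a template-id such as MaxPooling_Op<2> cannot be part of. The alias collapses the type into a single plain token. A small stand-in illustration of the pitfall, using a hypothetical macro rather than the real REGISTRAR:

    // Illustration only; the actual REGISTRAR macro lives in aidge_core and may differ.
    template <int DIM> struct DummyOp {};

    #define DECLARE_REG(TYPE) static int registrar_##TYPE = 0;

    // DECLARE_REG(DummyOp<2>)     // ill-formed: '<' and '>' cannot be pasted into an identifier
    using DummyOp2D = DummyOp<2>;  // collapse the template-id into one plain token
    DECLARE_REG(DummyOp2D)         // expands to: static int registrar_DummyOp2D = 0;
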
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Mul.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class MulImpl_cuda : public OperatorImpl { +public: + MulImpl_cuda(const Mul_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<MulImpl_cuda> create(const Mul_Op& op) { + return std::make_unique<MulImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Mul_Op, "cuda", Aidge::MulImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp b/include/aidge/backend/cuda/operator/PadImpl.hpp index 4452d3408e7b4780c1e5c4ea6553ba0b713df231..d51361d6ee5a3ec9a858d290b3f5fe5251b6fa97 100644 --- a/include/aidge/backend/cuda/operator/PadImpl.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl.hpp @@ -27,35 +27,41 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend template <DimIdx_t DIM> class PadImpl_cuda : public OperatorImpl { -private: - // CuDNN specific variables - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - int mLeftPad, mTopPad; - double mPadVal; - unsigned int mPadType; - public: - PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + PadImpl_cuda(const Pad_Op<DIM>& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<2> &op) { + static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM>& op) { return std::make_unique<PadImpl_cuda>(op); } -public: - void forward(); - void backward(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; private: + // CuDNN specific variables + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + int mLeftPad, mTopPad; + double mPadVal; + unsigned int mPadType; + template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& outGrad); }; -namespace { -// add cuda backend to Pad_Op<2> implementation registry -static Registrar<Pad_Op<2>> registrarPadImpl_cuda("cuda", Aidge::PadImpl_cuda<2>::create); -} // namespace +// Implementation entry point registration to Operator +using Pad2D_Op = Pad_Op<2>; +REGISTRAR(Pad2D_Op, "cuda", Aidge::PadImpl_cuda<2>::create); } // namespace Aidge #endif /* 
AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp index c6a83160da5cf3fea3d3415959c965e16c1eb4ff..11ddb0ea8b0e6603bf009c4ae0a7fa3247a8904f 100644 --- a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp @@ -9,8 +9,8 @@ * ********************************************************************************/ -#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ -#define AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ +#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ #include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" @@ -34,4 +34,4 @@ namespace Aidge const T *input, T *outputs); } -#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file +#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/PowImpl.hpp b/include/aidge/backend/cuda/operator/PowImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..403648d9a294ee598f117c8b05e6f0875e998307 --- /dev/null +++ b/include/aidge/backend/cuda/operator/PowImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Pow.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class PowImpl_cuda : public OperatorImpl { +public: + PowImpl_cuda(const Pow_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<PowImpl_cuda> create(const Pow_Op& op) { + return std::make_unique<PowImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + template <class T> void backward_(const Tensor& outGrad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Pow_Op, "cuda", Aidge::PowImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e89bea53ba766b0bd90f0c7acd631b0370d96298 --- /dev/null +++ b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp @@ -0,0 +1,38 @@ 
+/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +template <class T> +void powForward(const T* input, T* output, const T* exponent, + const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, + const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, + int outSize); + +} +#endif /* AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp index 285713f460b9d5b5e868c0c07ab23804f30dd694..344923ba1ee08642a3e3e5f685bfd2c7de8a74b4 100644 --- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -27,7 +27,25 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class ReLUImpl_cuda : public OperatorImpl { +public: + ReLUImpl_cuda(const ReLU_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op& op) { + return std::make_unique<ReLUImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; + ~ReLUImpl_cuda(); + private: // CuDNN specific variables #if CUDNN_VERSION >= 5000 @@ -38,27 +56,12 @@ private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; -public: - ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {} - - static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) { - return std::make_unique<ReLUImpl_cuda>(op); - } - -public: - void forward(); - void backward(); - ~ReLUImpl_cuda(); - -private: template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to ReLU_Op implementation registry -static Registrar<ReLU_Op> registrarReLUImpl_cuda("cuda", Aidge::ReLUImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(ReLU_Op, "cuda", Aidge::ReLUImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9d352b8b1d14aeaa4230accd7aa81c279c18b7a8 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp @@ -0,0 +1,30 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse 
Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge +{ + + template <class T> + void ReduceBackward(const T* input, + T* output, + const std::vector<std::size_t>& inputDims, + const std::vector<std::size_t>& outputDims, + const std::vector<int>& axes, + const std::vector<std::size_t>& factors, + int outSize); +} +#endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a50ff21b35f0b062c6a9c327ea2892c15055a175 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp @@ -0,0 +1,62 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ReduceMean.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class ReduceMeanImpl_cuda : public OperatorImpl { +public: + ReduceMeanImpl_cuda(const ReduceMean_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ReduceMeanImpl_cuda> create(const ReduceMean_Op& op) { + return std::make_unique<ReduceMeanImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + // CuDNN specific variables + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + + template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); + template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); +}; + +// Implementation entry point registration to Operator +REGISTRAR(ReduceMean_Op, "cuda", Aidge::ReduceMeanImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a5a7ae48d7e5bd8f370964d7f81795ecbaa5986b --- /dev/null +++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp @@ -0,0 +1,62 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + 
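
ReduceBackward above is shared by the ReduceMean and ReduceSum backward paths: conceptually it broadcasts the output gradient back along the reduced axes, with the factors argument presumably carrying the per-axis reduction sizes (needed, for instance, to divide by N in the mean case). A CPU sketch of the simplest case, a mean over axis 1 of an M x N tensor, just to fix ideas; the CUDA kernel must reproduce this for arbitrary dims/axes, hence its extra parameters.

    #include <cstddef>
    #include <vector>

    // CPU reference sketch (illustrative): backward of a mean over axis 1.
    // Each input element receives outputGrad[row] / N.
    std::vector<float> reduceMeanBackward2D(const std::vector<float>& outputGrad,
                                            std::size_t M, std::size_t N) {
        std::vector<float> inputGrad(M * N);
        for (std::size_t i = 0; i < M; ++i)
            for (std::size_t j = 0; j < N; ++j)
                inputGrad[i * N + j] = outputGrad[i] / static_cast<float>(N);
        return inputGrad;
    }
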
* terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ReduceSum.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class ReduceSumImpl_cuda : public OperatorImpl { +public: + ReduceSumImpl_cuda(const ReduceSum_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op& op) { + return std::make_unique<ReduceSumImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + // CuDNN specific variables + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + + template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); + template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); +}; + +// Implementation entry point registration to Operator +REGISTRAR(ReduceSum_Op, "cuda", Aidge::ReduceSumImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp index 7b43df680bef115310669f0d55f2f78ef4fe9fa6..d412590c63f925806973038d67ee18e0847f79c2 100644 --- a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp @@ -27,27 +27,32 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class ReshapeImpl_cuda : public OperatorImpl { -private: - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - public: - ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {} + ReshapeImpl_cuda(const Reshape_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) { + static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op& op) { return std::make_unique<ReshapeImpl_cuda>(op); } -public: - void forward(); - void backward(); - ~ReshapeImpl_cuda(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; }; -namespace { -// add cuda backend to Reshape_Op implementation registry -static Registrar<Reshape_Op> registrarReshapeImpl_cuda("cuda", Aidge::ReshapeImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(Reshape_Op, "cuda", Aidge::ReshapeImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp new 
file mode 100644 index 0000000000000000000000000000000000000000..f83b41ae139482cdb0cd1060846c77ba78fcc0ee --- /dev/null +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp @@ -0,0 +1,65 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ShiftGELU.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class ShiftGELUImpl_cuda : public OperatorImpl { +public: + ShiftGELUImpl_cuda(const ShiftGELU_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ShiftGELUImpl_cuda> create(const ShiftGELU_Op &op) { + return std::make_unique<ShiftGELUImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); + +}; + +// Implementation entry point registration to Operator +REGISTRAR(ShiftGELU_Op, "cuda", Aidge::ShiftGELUImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..14268521451a631ccb9194d44ed7543af8d494f5 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp @@ -0,0 +1,78 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +/** + * @brief Compute the forward for ShiftGELU + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param GELUtensor: Pointer to an empty memory block allocated on the GPU (just use for computation) + * @param SumTensor: Pointer to an empty memory block allocated on the GPU (just use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) + * @param output_bits: Desired bit precision (8 for int8, for example) +*/ +template <class T> +__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits); + +/** + * @brief Wrapper function to execute ShiftGELUforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor +*/ +template <class T> +void ShiftGELUforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); + +/** + * @brief Compute the backward for ShiftGELU + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param size: Number of elements in the input tensor +*/ +template <class T> +__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size); + +/** + * @brief Wrapper function to execute ShiftGELUbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor +*/ +template <class T> +void ShiftGELUbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size); + +} + +#endif /* AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..707b5616fde120f7e8ef38e6dc9f1552cfdb0d59 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp @@ -0,0 +1,64 @@ 
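
For readers unfamiliar with the SF / output_bits parameters used by the shift-based kernels (ShiftGELU above, ShiftMax below): the convention inferred from the comments, not from the kernel bodies in the .cu files, is the usual uniform-quantization one, where a real value x is represented by an integer q with x ≈ q * SF. A tiny illustration of that bookkeeping:

    #include <cmath>
    #include <cstdint>

    // Illustrative only: with output_bits = 8 the quantized values live roughly in
    // [-128, 127], so the representable real range is about [-128 * SF, 127 * SF].
    inline std::int32_t quantize(float x, double SF) {
        return static_cast<std::int32_t>(std::lround(x / SF));
    }
    inline float dequantize(std::int32_t q, double SF) {
        return static_cast<float>(q * SF);
    }
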
+/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ShiftMax.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class ShiftMaxImpl_cuda : public OperatorImpl { +public: + ShiftMaxImpl_cuda(const ShiftMax_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ShiftMaxImpl_cuda> create(const ShiftMax_Op &op) { + return std::make_unique<ShiftMaxImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); + +}; + +// Implementation entry point registration to Operator +REGISTRAR(ShiftMax_Op, "cuda", Aidge::ShiftMaxImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..037a7cbb6362a8eca5a9e6f5a277b29a6a6bd907 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp @@ -0,0 +1,79 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +/** + * @brief Compute the forward for ShiftMax + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param factor: Pointer to an empty memory block allocated on the GPU (just use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param new_SF: Scaling factor of output that can be use to dequantify +*/ +template <class T> +__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF); + +/** + * @brief Wrapper function to execute ShiftMaxforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor +*/ +template <class T> +void ShiftMaxforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); + +/** + * @brief Compute the backward for ShiftMax + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param dims: Dimensions of input tensor +*/ +template <class T> +__global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T* output_grad, const int* dims); + +/** + * @brief Wrapper function to execute ShiftMaxbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor +*/ +template <class T> +void ShiftMaxbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size, std::vector<long unsigned int> dims); + +} + +#endif /* AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp index 90dbb717732ad788b868fdc95eb55579a5e0b9f6..bc29b9e5f53716641a692cd63c29f4600f3cdd02 100644 --- a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp +++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp @@ -27,7 +27,25 @@ #include 
"aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class SigmoidImpl_cuda : public OperatorImpl { +public: + SigmoidImpl_cuda(const Sigmoid_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op& op) { + return std::make_unique<SigmoidImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; + ~SigmoidImpl_cuda(); + private: // CuDNN specific variables #if CUDNN_VERSION >= 5000 @@ -38,27 +56,12 @@ private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; -public: - SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {} - - static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) { - return std::make_unique<SigmoidImpl_cuda>(op); - } - -public: - void forward(); - void backward(); - ~SigmoidImpl_cuda(); - -private: template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to Sigmoid_Op implementation registry -static Registrar<Sigmoid_Op> registrarSigmoidImpl_cuda("cuda", Aidge::SigmoidImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(Sigmoid_Op, "cuda", Aidge::SigmoidImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp index fd1a76692abdf16b9854b90f535f68329ae5877a..45c833f3e7f9f25258469a4d1e34e8598df068ef 100644 --- a/include/aidge/backend/cuda/operator/SubImpl.hpp +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -27,30 +27,33 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class SubImpl_cuda : public OperatorImpl { -private: - - public: - SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {} + SubImpl_cuda(const Sub_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) { + static std::unique_ptr<SubImpl_cuda> create(const Sub_Op& op) { return std::make_unique<SubImpl_cuda>(op); } -public: - void forward(); - void backward(); - // ~SubImpl_cuda(); + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + private: template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); }; -namespace { -// add cuda backend to Sub_Op implementation registry -static Registrar<Sub_Op> registrarSubImpl_cuda("cuda", Aidge::SubImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(Sub_Op, "cuda", Aidge::SubImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp index 35e879513fee0ec9354edecefd3d53860e54a0b1..166acd6adee397a3f284363a9db1e71152467b94 100644 
--- a/include/aidge/backend/cuda/operator/TanhImpl.hpp +++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp @@ -27,7 +27,25 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { +// Operator implementation entry point for the backend class TanhImpl_cuda : public OperatorImpl { +public: + TanhImpl_cuda(const Tanh_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op& op) { + return std::make_unique<TanhImpl_cuda>(op); + } + + virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; + ~TanhImpl_cuda(); + private: // CuDNN specific variables #if CUDNN_VERSION >= 5000 @@ -38,27 +56,12 @@ private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; -public: - TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {} - - static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) { - return std::make_unique<TanhImpl_cuda>(op); - } - -public: - void forward(); - void backward(); - ~TanhImpl_cuda(); - -private: template <class T> void forward_(const Tensor& input); template <class T> void backward_(const Tensor& output_grad); }; -namespace { -// add cuda backend to Tanh_Op implementation registry -static Registrar<Tanh_Op> registrarTanhImpl_cuda("cuda", Aidge::TanhImpl_cuda::create); -} // namespace +// Implementation entry point registration to Operator +REGISTRAR(Tanh_Op, "cuda", Aidge::TanhImpl_cuda::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */ diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp index 7218cc24aed718f57a1866be74e7ba9124a5a7f1..f21886e502b9017aa55e250e7257d16bc5d04501 100644 --- a/include/aidge/backend/cuda/utils/CudaContext.hpp +++ b/include/aidge/backend/cuda/utils/CudaContext.hpp @@ -157,8 +157,10 @@ namespace Aidge { return CUDNN_DATA_UINT8; case DataType::Int32: return CUDNN_DATA_INT32; +#if CUDNN_VERSION >= 8100 case DataType::Int64: return CUDNN_DATA_INT64; +#endif default: assert(false && "Unsupported CuDNN type"); } diff --git a/project_name.txt b/project_name.txt deleted file mode 100644 index d029f485dbbd8bc3386a50eee3a4d8aa201aeb74..0000000000000000000000000000000000000000 --- a/project_name.txt +++ /dev/null @@ -1 +0,0 @@ -aidge_backend_cuda \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..911af058463e100bc35453315919ec8bda3f2845 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,114 @@ +[project] +name = "aidge_backend_cuda" +description="CUDA implementations of the operators of aidge framework" +dependencies = [ + "numpy", +] +requires-python = ">= 3.7" +readme = "README.md" +license = { file = "LICENSE" } +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Programming Language :: Python :: 3" +] +dynamic = ["version"] + +##################################################### +# SETUPTOOLS +[tool.setuptools] +[tool.setuptools.packages.find] +where = ["."] # list of folders that contain the packages (["."] by default) +include = ["aidge_backend_cuda*"] # package names should match these glob patterns (["*"] by default) +exclude = ["aidge_backend_cuda.unit_tests*"] # exclude packages matching these glob patterns (empty by default) +namespaces = false # to disable scanning PEP 420 namespaces (true by default) +# SETUPTOOLS_SCM +[tool.setuptools_scm] +write_to = 
"aidge_backend_cuda/_version.py" + +[build-system] +requires = [ + "setuptools>=68", + "setuptools-scm", + "cmake>=3.18.0", + "toml" +] +build-backend = "setuptools.build_meta" + +##################################################### +# CIBUILDWHEEL +[tool.cibuildwheel] +build-frontend = "build" +test-requires = "pytest" +test-command = "pytest {project}/aidge_backend_cuda/unit_tests" +# uncomment to run cibuildwheel locally on selected distros +# build=[ +# "cp38-manylinux_x86_64", +# "cp39-manylinux_x86_64", +# "cp310-manylinux_x86_64" +# ] + +[tool.cibuildwheel.container-engine] +# pass command line options to 'docker run' +name = "docker" +create-args = [ + "--runtime=nvidia", + "--gpus", "all", + "--privileged", + "-v","/cache", + "-v","/var/run/docker.sock:/var/run/docker.sock", +] + +### AIDGE DEPENDENCIES DECLARATION +[tool.cibuildwheel.environment] +# These variables are here for debug purpose but their values when called from CI are set in .gitlab-ci.yml +BUILD_WITH_CUDA=1 +AIDGE_DEPENDENCIES = "aidge_core aidge_backend_cpu" # format => "dep_1 dep_2 ... dep_n" +AIDGE_INSTALL="/AIDGE_INSTALL_CIBUILDWHEEL" +ARCH="x86_64" +CUDNN_VERSION="9" +CUDA_MAJOR_VERSION="11" +CUDA_MINOR_VERSION="8" +DOCKER_HOST="unix:///var/run/docker.sock" +SEARCH_PATH="/home/ubuntu/aidge/aidge" # debug path +# these two following variables are set within CMakeLists.txt when calling cibuildwheel from CI +LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH" +PATH="/usr/local/cuda/bin:$PATH" +[tool.cibuildwheel.linux] +before-build = [ + "export CUDA_TOOLKIT_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION", + "echo '\n\n\n\n yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH'", + "yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/$ARCH/cuda-rhel7.repo", + "yum clean all", + "yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH", + "yum list available | grep cudnn", + "yum -y install libcudnn$CUDNN_VERSION-cuda-$CUDA_MAJOR_VERSION.$ARCH", + "yum -y install libcudnn$CUDNN_VERSION-devel-cuda-$CUDA_MAJOR_VERSION.$ARCH", + "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH", + "export PATH=/usr/local/cuda/bin:$PATH", + "which nvcc", + "nvcc --version", + "echo '\n\n\n\nInstalling required dependencies for aidge_backend_cuda.\n\n'", + "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host/$SEARCH_PATH" +] +before-test= [ + "export CUDA_TOOLKIT_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION", + "echo '\n\n\n\n yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH'", + "yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/$ARCH/cuda-rhel7.repo", + "yum clean all", + "yum -y install cuda-toolkit-$CUDA_TOOLKIT_VERSION.$ARCH", + "yum list available | grep cudnn", + "yum -y install libcudnn$CUDNN_VERSION-cuda-$CUDA_MAJOR_VERSION.$ARCH", + "yum -y install libcudnn$CUDNN_VERSION-devel-cuda-$CUDA_MAJOR_VERSION.$ARCH", + "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH", + "export PATH=/usr/local/cuda/bin:$PATH", + "nvcc --version", + "echo '\n\n\n\nInstalling required dependencies for aidge_backend_cuda.\n\n'", + "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host/$SEARCH_PATH" +] +[tool.cibuildwheel.windows] +before-build = [ + "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1" +] +before-test = [ + "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1" +] diff --git a/requirements.txt b/requirements.txt deleted 
file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/setup.py b/setup.py index 80500f3165dd87eb7b6dd73c78b89806cc8a874a..706fc53ca08319ee487ef789ebc85f0d513ab25b 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,25 @@ #!/usr/bin/env python3 -""" Aidge - -#TODO To change -POC of the next framework named Aidge -""" - -DOCLINES = (__doc__ or '').split("\n") - import sys import os -# Python supported version checks -if sys.version_info[:2] < (3, 7): - raise RuntimeError("Python version >= 3.7 required.") - - -CLASSIFIERS = """\ -Development Status :: 2 - Pre-Alpha -""" - import shutil import pathlib -import subprocess import multiprocessing from math import ceil +import toml + from setuptools import setup, Extension from setuptools import find_packages from setuptools.command.build_ext import build_ext + def get_project_name() -> str: - return open(pathlib.Path().absolute() / "project_name.txt", "r").read() + with open(pathlib.Path().absolute() / "pyproject.toml", "r") as file: + project_toml = toml.load(file) + return project_toml["project"]["name"] + def get_project_version() -> str: aidge_root = pathlib.Path().absolute() @@ -43,8 +31,8 @@ class CMakeExtension(Extension): def __init__(self, name): super().__init__(name, sources=[]) -class CMakeBuild(build_ext): +class CMakeBuild(build_ext): def run(self): # This lists the number of processors available on the machine # The compilation will use half of them @@ -62,17 +50,45 @@ class CMakeBuild(build_ext): os.chdir(str(build_temp)) - # Impose to use the executable of the python - # used to launch setup.py to setup PythonInterp - param_py = "-DPYTHON_EXECUTABLE=" + sys.executable + compile_type = ( + "Release" + if "AIDGE_PYTHON_BUILD_TYPE" not in os.environ + else os.environ["AIDGE_PYTHON_BUILD_TYPE"] + ) + + install_path = ( + os.path.join(sys.prefix, "lib", "libAidge") + if "AIDGE_INSTALL" not in os.environ + else os.environ["AIDGE_INSTALL"] + ) + + # using ninja as default build system to build faster and with the same compiler as on windows + build_gen = ( + ["-G", os.environ["AIDGE_BUILD_GEN"]] + if "AIDGE_BUILD_GEN" in os.environ + else [] + ) + + self.spawn( + [ + "cmake", + *build_gen, + str(cwd), + "-DTEST=OFF", + f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}", + f"-DCMAKE_BUILD_TYPE={compile_type}", + "-DPYBIND=ON", + "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", + "-DCOVERAGE=OFF", + "-DCMAKE_CUDA_ARCHITECTURES=native", + ] + ) - compile_type = 'Debug' - install_path = os.path.join(sys.prefix, "lib", "libAidge") if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"] - - self.spawn(['cmake', str(cwd), param_py, '-DTEST=OFF', f'-DCMAKE_INSTALL_PREFIX:PATH={install_path}', f'-DCMAKE_BUILD_TYPE={compile_type}']) if not self.dry_run: - self.spawn(['cmake', '--build', '.', '--config', compile_type, '-j', max_jobs]) - self.spawn(['cmake', '--install', '.', '--config', compile_type]) + self.spawn( + ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs] + ) + self.spawn(["cmake", "--install", ".", "--config", compile_type]) os.chdir(str(cwd)) aidge_package = build_lib / (get_project_name()) @@ -83,8 +99,10 @@ class CMakeBuild(build_ext): # Copy all shared object files from build_temp/lib to aidge_package for root, _, files in os.walk(build_temp.absolute()): for file in files: - if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())): - currentFile=os.path.join(root, file) + if (file.endswith(".so") or 
file.endswith(".pyd")) and ( + root != str(aidge_package.absolute()) + ): + currentFile = os.path.join(root, file) shutil.copy(currentFile, str(aidge_package.absolute())) # Copy version.txt in aidge_package @@ -92,23 +110,12 @@ class CMakeBuild(build_ext): shutil.copy("version.txt", str(aidge_package.absolute())) -if __name__ == '__main__': - +if __name__ == "__main__": setup( - name=get_project_name(), - version=get_project_version(), - python_requires='>=3.7', - description=DOCLINES[0], - long_description_content_type="text/markdown", - long_description="\n".join(DOCLINES[2:]), - classifiers=[c for c in CLASSIFIERS.split('\n') if c], - packages=find_packages(where="."), include_package_data=True, ext_modules=[CMakeExtension(get_project_name())], cmdclass={ - 'build_ext': CMakeBuild, + "build_ext": CMakeBuild, }, - install_requires=['aidge_core'], zip_safe=False, - ) diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu index 898475b5db325afcaedff44756cc2157cf9e2eec..c70b024fbab1a031ea69d5d9b169dc115b7320db 100644 --- a/src/data/TensorImpl.cu +++ b/src/data/TensorImpl.cu @@ -98,3 +98,34 @@ bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const { thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData.data()); return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData); } + +template void Aidge::thrust_copy<double, double>(double const*, double*, unsigned long); +template void Aidge::thrust_copy<double, float>(double const*, float*, unsigned long); +template void Aidge::thrust_copy<double, int>(double const*, int*, unsigned long); +template void Aidge::thrust_copy<float, double>(float const*, double*, unsigned long); +template void Aidge::thrust_copy<float, float>(float const*, float*, unsigned long); +template void Aidge::thrust_copy<float, int>(float const*, int*, unsigned long); +template void Aidge::thrust_copy<int, double>(int const*, double*, unsigned long); +template void Aidge::thrust_copy<int, float>(int const*, float*, unsigned long); +template void Aidge::thrust_copy<int, int>(int const*, int*, unsigned long); +template void Aidge::thrust_copy<long, double>(long const*, double*, unsigned long); +template void Aidge::thrust_copy<long, float>(long const*, float*, unsigned long); +template void Aidge::thrust_copy<long, int>(long const*, int*, unsigned long); +template void Aidge::thrust_copy<short, double>(short const*, double*, unsigned long); +template void Aidge::thrust_copy<short, float>(short const*, float*, unsigned long); +template void Aidge::thrust_copy<short, int>(short const*, int*, unsigned long); +template void Aidge::thrust_copy<signed char, double>(signed char const*, double*, unsigned long); +template void Aidge::thrust_copy<signed char, float>(signed char const*, float*, unsigned long); +template void Aidge::thrust_copy<signed char, int>(signed char const*, int*, unsigned long); +template void Aidge::thrust_copy<unsigned char, double>(unsigned char const*, double*, unsigned long); +template void Aidge::thrust_copy<unsigned char, float>(unsigned char const*, float*, unsigned long); +template void Aidge::thrust_copy<unsigned char, int>(unsigned char const*, int*, unsigned long); +template void Aidge::thrust_copy<unsigned int, double>(unsigned int const*, double*, unsigned long); +template void Aidge::thrust_copy<unsigned int, float>(unsigned int const*, float*, unsigned long); +template void Aidge::thrust_copy<unsigned int, int>(unsigned int const*, int*, unsigned long); +template void Aidge::thrust_copy<unsigned long, 
double>(unsigned long const*, double*, unsigned long); +template void Aidge::thrust_copy<unsigned long, float>(unsigned long const*, float*, unsigned long); +template void Aidge::thrust_copy<unsigned long, int>(unsigned long const*, int*, unsigned long); +template void Aidge::thrust_copy<unsigned short, double>(unsigned short const*, double*, unsigned long); +template void Aidge::thrust_copy<unsigned short, float>(unsigned short const*, float*, unsigned long); +template void Aidge::thrust_copy<unsigned short, int>(unsigned short const*, int*, unsigned long); diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 74d89c405530766324407fc42345f237931dc2f4..de7ea925554906ea5fe1e5dcba268b17a06a47bd 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -44,6 +44,10 @@ void Aidge::AddImpl_cuda::forward() { std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); int product = 1; @@ -191,7 +195,7 @@ void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector< tensorDesc, &workspaceSize)); - float *d_workspace; + void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1ee9ebcb9437b89666da21a915907b5434ece26 --- /dev/null +++ b/src/operator/AndImpl.cpp @@ -0,0 +1,95 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AndImpl.hpp"
+#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/And.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::AndImpl_cuda::forward() {
+    const And_Op& op = static_cast<const And_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in And operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run And forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot And inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        if (dims[i].size() < 4) {
+            dims[i].resize(4, 1);
+        }
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AndImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
+    const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
+    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+
+    std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
+    if(op.getOutput(0)->nbDims()>1) {
+        for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
+            outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1];
+        }
+    }
+
+    Aidge::AndForward<T>(input1Ptr, input2Ptr, outputPtr,
+                         inputsDims[0], inputsDims[1],
+                         inputsStrides[0], inputsStrides[1], outputStrides, +
static_cast<int>(op.getOutput(0)->size())); +} \ No newline at end of file diff --git a/src/operator/AndImpl_CUDA_kernels.cu b/src/operator/AndImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..34bfccf98c013d8bfc934325f4e327cbae9e7b4a --- /dev/null +++ b/src/operator/AndImpl_CUDA_kernels.cu @@ -0,0 +1,95 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cuda_fp16.h> + +#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp" + +// Helper function for comparison +template <typename T> +__device__ bool compareE(T a, T b) { + return a == b; +} +template <> +__device__ bool compareE<half>(half a, half b) { + return __half2float(a) == __half2float(b); +} + +template <typename T> +__global__ void and_cuda_Kernel(const T* input1, const T* input2, T* output, + int* input1_shape, int* input2_shape, + int* input1_strides, int* input2_strides, int* output_strides, + int num_dims, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + + int input1_idx = 0, input2_idx = 0; + int temp_idx = idx; + for (int i = 0; i < num_dims; ++i) { + int dim = temp_idx / output_strides[i]; + temp_idx %= output_strides[i]; + input1_idx += (input1_shape[i] == 1 ? 0 : dim) * input1_strides[i]; + input2_idx += (input2_shape[i] == 1 ? 0 : dim) * input2_strides[i]; + } + + output[idx] = static_cast<T>(compareE(input1[input1_idx], input2[input2_idx])); +} + +template <typename T> +void Aidge::AndForward(const T* input1, const T* input2, T* output, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize) +{ + int *d_input1_strides, *d_input2_strides, *d_output_strides, *d_input1_shape, *d_input2_shape; + // Allocate device memory + CHECK_CUDA_STATUS(cudaMalloc(&d_input1_shape, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input2_shape, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input1_strides, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input2_strides, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, input1Dims.size() * sizeof(int))); + + // Copy data from host to device; + CHECK_CUDA_STATUS(cudaMemcpy(d_input1_shape, input1Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input2_shape, input2Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input1_strides, input1Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input2_strides, input2Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + int blockSize = 256; + int numBlocks = (outSize + blockSize - 1) / blockSize; + + int num_dims = input1Dims.size(); + // Launch the kernel + and_cuda_Kernel<<<numBlocks, blockSize>>>(input1, input2, 
output, + d_input1_shape, d_input2_shape, + d_input1_strides, d_input2_strides, d_output_strides, + num_dims, outSize); + CHECK_CUDA_STATUS(cudaFree(d_input1_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input2_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input1_strides)); + CHECK_CUDA_STATUS(cudaFree(d_input2_strides)); + CHECK_CUDA_STATUS(cudaFree(d_output_strides)); +}; + +template void Aidge::AndForward(const double* input1, const double* input2, double* output, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, + const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::AndForward(const float* input1, const float* input2, float* output, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, + const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::AndForward(const half* input1, const half* input2, half* output, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, + const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); \ No newline at end of file diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50d00592ca70333d6fbdd7a10761a0ea2e9beb4b --- /dev/null +++ b/src/operator/ArgMaxImpl.cpp @@ -0,0 +1,74 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp" +#include "aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/ArgMax.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ArgMaxImpl_cuda::forward() { + const ArgMax_Op& op = dynamic_cast<const ArgMax_Op&>(mOp); + AIDGE_ASSERT(mOp.getRawInput(0), "missing input in ArgMax operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ArgMax forward because the input has no implementation."); + + const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const std::int32_t axis = op.axis(); + const DimSize_t selectLastIdx = op.selectLastIndex(); + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input, axis, selectLastIdx); + break; + case DataType::Float32: + forward_<float>(input, axis, selectLastIdx); + break; + case DataType::Float16: + forward_<half>(input, axis, selectLastIdx); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + + +template <class T> +void Aidge::ArgMaxImpl_cuda::forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + + const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + std::vector<int> inputStrides(op.getInput(0)->nbDims(), 1); + if(op.getInput(0)->nbDims()>1) { + for (int i = op.getInput(0)->nbDims()-2; i >= 0; i--) { + inputStrides[i] = inputStrides[i+1] * op.getInput(0)->dims()[i+1]; + } + } + + std::vector<int> inputShape(input.nbDims()); + + // Use std::transform to convert each element + std::transform(input.dims().begin(), input.dims().end(), inputShape.begin(), + [](size_t value) { + return static_cast<int>(value); + }); + Aidge::ArgMax_cuda_forward_kernel<T>(inputPtr, outputPtr, + inputShape, inputStrides, + axis, static_cast<int>(op.getInput(0)->size()), selectLastIdx); +} diff --git a/src/operator/ArgMaxImpl_CUDA_kernels.cu b/src/operator/ArgMaxImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..7010236d06135171309ac63f0e2d93fa85ff76d8 --- /dev/null +++ b/src/operator/ArgMaxImpl_CUDA_kernels.cu @@ -0,0 +1,161 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp" +#define MAX_ERRORS 10 +// Helper function for comparison +template <typename T> +__device__ bool compareGT(T a, T b) { + return a > b; +} +template <> +__device__ bool compareGT<half>(half a, half b) { + return __half2float(a) > __half2float(b); +} + +// Helper function for comparison +template <typename T> +__device__ bool compareGE(T a, T b) { + return a >= b; +} +template <> +__device__ bool compareGE<half>(half a, half b) { + return __half2float(a) >= __half2float(b); +} +template <typename T> +__global__ void argmax_forward(const T* input, T* output, int* dims, int* strides, int axis, int total_elems, T minValue) { + const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = blockDim.x * gridDim.x; + + for (unsigned int idx = index; idx < total_elems; idx += stride) { + int axis_stride = strides[axis]; + int axis_dim = dims[axis]; + int outer_stride = idx / axis_stride; + int inner_stride = idx % axis_stride; + + T max_val = minValue; + int max_idx = 0; + + for (int i = 0; i < axis_dim; ++i) { + int offset = outer_stride * axis_dim * axis_stride + i * axis_stride + inner_stride; + if (offset >= total_elems) { + return; + } + T val = input[offset]; + if (compareGT(val, max_val)) { + max_val = val; + max_idx = i; + } + } + + int output_index = outer_stride * axis_stride + inner_stride; + if (output_index >= (total_elems / axis_dim)) { + return; + } + output[output_index] = max_idx; + } +} + +template <typename T> +__global__ void argmax_forward_selectLastIdx(const T* input, T* output, int* dims, int* strides, int axis, int total_elems, T minValue) { + const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = blockDim.x * gridDim.x; + + for (unsigned int idx = index; idx < total_elems; idx += stride) { + int axis_stride = strides[axis]; + int axis_dim = dims[axis]; + int outer_stride = idx / axis_stride; + int inner_stride = idx % axis_stride; + + T max_val = minValue; + int max_idx = 0; + + for (int i = 0; i < axis_dim; ++i) { + int offset = outer_stride * axis_dim * axis_stride + i * axis_stride + inner_stride; + if (offset >= total_elems) { + return; + } + T val = input[offset]; + if (compareGE(val, max_val)) { + max_val = val; + max_idx = i; + } + } + + int output_index = outer_stride * axis_stride + inner_stride; + if (output_index >= (total_elems / axis_dim)) { + return; + } + output[output_index] = max_idx; + } +} + +template <typename T> +T minValue(); + +template <> +double minValue<double>() { + return std::numeric_limits<double>::min(); +} + +template <> +float minValue<float>() { + return std::numeric_limits<float>::min(); +} + +template <> +half minValue<half>() { + return __float2half(std::numeric_limits<float>::min()); +} + +template <typename T> +void Aidge::ArgMax_cuda_forward_kernel(const T* input, T* output, + const std::vector<int>& inputDims, const std::vector<int>& inputStrides, + int axis, int total_elems, std::size_t selectLastIdx) { + + // Define block and grid sizes + int blockSize = 256; + int gridSize = (total_elems + blockSize - 1) / blockSize; + + + int *d_input_strides, *d_input_shape; + // Allocate device memory + CHECK_CUDA_STATUS(cudaMalloc(&d_input_shape, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input_strides, inputDims.size() * sizeof(int))); + + // Copy 
data from host to device; + CHECK_CUDA_STATUS(cudaMemcpy(d_input_shape, inputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input_strides, inputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + // Launch the kernel + if (selectLastIdx) { + argmax_forward_selectLastIdx<<<gridSize, blockSize>>>(input, output, d_input_shape, d_input_strides, axis, total_elems, minValue<T>()); + } + else { + argmax_forward<<<gridSize, blockSize>>>(input, output, d_input_shape, d_input_strides, axis, total_elems, minValue<T>()); + } + + CHECK_CUDA_STATUS(cudaFree(d_input_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input_strides)); +} + + + +template void Aidge::ArgMax_cuda_forward_kernel(const double* input, double* output, + const std::vector<int>& inputDims, const std::vector<int>& inputStrides, + int axis, int total_elems, std::size_t selectLastIdx); + +template void Aidge::ArgMax_cuda_forward_kernel(const float* input, float* output, + const std::vector<int>& inputDims, const std::vector<int>& inputStrides, + int axis, int total_elems, std::size_t selectLastIdx); + +template void Aidge::ArgMax_cuda_forward_kernel(const half* input, half* output, + const std::vector<int>& inputDims, const std::vector<int>& inputStrides, + int axis, int total_elems, std::size_t selectLastIdx); \ No newline at end of file diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index b627f69a289340b42e1de4baa6bb09d1ea2e5e99..24e01db03692ffaa884b31a224a1947a9e1645a0 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -24,14 +24,18 @@ template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::forward() { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - // FIXME: uncomment the following code once memory handling will work - assert(mOp.getRawInput(0) && "missing input #0"); - assert(mOp.getRawInput(1) && "missing input #1"); + AIDGE_ASSERT(op.getInput(0), "missing input #0"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1), "missing input #1"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) 
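// --- Illustrative aside, not part of the patch: the hunk just below stops
// dereferencing the bias input #2 unconditionally and treats it as optional.
// A minimal standalone sketch of the same idiom, reusing only accessors already
// used in this file (the helper name `optionalInput` is hypothetical):
static Aidge::Tensor optionalInput(const Aidge::OperatorTensor& op,
                                   Aidge::IOIndex_t idx,
                                   std::shared_ptr<Aidge::Tensor>& fallback) {
    Aidge::Tensor t;  // stays default-constructed when the input is absent or has no implementation
    if (op.getInput(idx) && op.getInput(idx)->hasImpl()) {
        t = op.getInput(idx)->refCastFrom(fallback, *op.getOutput(0));
    }
    return t;
}
// The CUDA code can then branch on whether a bias was actually provided. ---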
const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); - const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + Tensor input2; + if(op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + } // Lazy-initialize CuDNN convolution descriptor if (mConvDesc == nullptr) { @@ -48,14 +52,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end()); } - CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, - DIM, - &paddings[0], - &strides[0], - &upscales[0], - CUDNN_CROSS_CORRELATION, - DataTypeToCudnn(op.getOutput(0)->dataType()))); + CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + DIM, + &paddings[0], + &strides[0], + &upscales[0], + CUDNN_CROSS_CORRELATION, + DataTypeToCudnn(op.getOutput(0)->dataType()))); } // Lazy-initialize CuDNN filter descriptor @@ -72,27 +76,6 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // Set forward algorithm and allocate the required workspace if (mFwdWorkspace == nullptr) { - // Find the best CuDNN forward algorithm (the one with the lowest compute time) - int maxAlgoIterations = 0; - cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(), - &maxAlgoIterations); - - assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionForwardAlgorithm"); - - int returnAlgoCounts = 0; - std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations); - - CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm( - CudaContext::cudnnHandle(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - mFilterDesc, - mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - maxAlgoIterations, - &returnAlgoCounts, - &returnFwdAlgo[0])); - mFwdAlgo = returnFwdAlgo[0].algo; - // Allocate the workspace required by the chosen CuDNN forward algorithm size_t workspaceSize = 0; @@ -166,14 +149,18 @@ template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::backward() { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - // FIXME: uncomment the following code once memory handling will work - assert(mOp.getRawInput(0) && "missing input #0"); - assert(mOp.getRawInput(1) && "missing input #1"); + AIDGE_ASSERT(op.getInput(0), "missing input #0"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1), "missing input #1"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) 
- const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0)); - const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0)); + const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + Tensor input2; + if(op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + } // Set forward algorithm and allocate the required workspace if (mBwdWorkspace == nullptr) { diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0326a60c1a3aabf43ca3a1d892328991d6d72366 --- /dev/null +++ b/src/operator/DivImpl.cpp @@ -0,0 +1,113 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/DivImpl.hpp" +#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Div.hpp" +#include "aidge/utils/Types.h" + +void Aidge::DivImpl_cuda::forward() { + const Div_Op& op = static_cast<const Div_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Div operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Div forward because the 0-th input has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in Div operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Div forward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Div inputs with two differents data type."); + } + + std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); + std::vector<Tensor> inputs(op.nbInputs()); + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, 
strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::DivImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); + const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); + if(op.getOutput(0)->nbDims()>1) { + for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { + outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + } + } + std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1); + for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { + outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); + } + + Aidge::divForward<T>(input1Ptr, outputPtr, input2Ptr, + inputsDims[0], inputsDims[1], outDims, + inputsStrides[0], inputsStrides[1], outputStrides, + static_cast<int>(op.getOutput(0)->size())); +} + +void Aidge::DivImpl_cuda::backward() { + // TODO +} + +template <class T> +void Aidge::DivImpl_cuda::backward_(const Tensor& outGrad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + // TODO +} \ No newline at end of file diff --git a/src/operator/DivImpl_CUDA_kernels.cu b/src/operator/DivImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ff5a2865a32cb38bf92571bdea6bf90ca8516eb --- /dev/null +++ b/src/operator/DivImpl_CUDA_kernels.cu @@ -0,0 +1,100 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp" + + +// Helper function for Div +template <typename T> +__device__ T div(T a, T b) { + return a / b; +} +template <> +__device__ half div<half>(half a, half b) { +#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 8000 + return __hdiv(a, b); +#else + return __float2half(__half2float(a) / __half2float(b)); +#endif +} + +template <class T> +__global__ void divKernel(const T* input1, T* output, const T* input2, + int* input1_shape, int* input2_shape, int* output_shape, + int* input1_strides, int* input2_strides, int* output_strides, + int num_dims, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + + int input1_idx = 0, input2_idx = 0; + int temp_idx = idx; + for (int i = 0; i < num_dims; ++i) { + int dim = temp_idx / output_strides[i]; + temp_idx %= output_strides[i]; + input1_idx += (input1_shape[i] == 1 ? 0 : dim) * input1_strides[i]; + input2_idx += (input2_shape[i] == 1 ? 0 : dim) * input2_strides[i]; + } + output[idx] = div(input1[input1_idx], input2[input2_idx]); +} + +template <class T> +void Aidge::divForward(const T* input1, T* output, const T* input2, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize) +{ + int *d_input1_strides, *d_input2_strides, *d_output_strides, *d_input1_shape, *d_input2_shape, *d_output_shape; + // Allocate device memory + CHECK_CUDA_STATUS(cudaMalloc(&d_input1_shape, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input2_shape, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_output_shape, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input1_strides, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input2_strides, input1Dims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, input1Dims.size() * sizeof(int))); + + // Copy data from host to device; + CHECK_CUDA_STATUS(cudaMemcpy(d_input1_shape, input1Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input2_shape, input2Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_output_shape, outputDims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input1_strides, input1Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input2_strides, input2Strides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice)); + int blockSize = 256; + int numBlocks = (outSize + blockSize - 1) / blockSize; + + int num_dims = input1Dims.size(); + // Launch the kernel + divKernel<<<numBlocks, blockSize>>>(input1, output, input2, + d_input1_shape, d_input2_shape, d_output_shape, + d_input1_strides, d_input2_strides, d_output_strides, + num_dims, outSize); + CHECK_CUDA_STATUS(cudaFree(d_input1_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input2_shape)); + CHECK_CUDA_STATUS(cudaFree(d_output_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input1_strides)); + 
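// --- Illustrative aside, not part of the patch: divKernel above (like
// and_cuda_Kernel earlier) maps an output flat index back to the flat index of
// each broadcast input by walking the output strides and ignoring dimensions of
// size 1. A host-side reference of that mapping, assuming the row-major strides
// computed in DivImpl.cpp / AndImpl.cpp:
#include <cstddef>
#include <vector>
static int broadcastFlatIndex(int outIdx,
                              const std::vector<int>& inShape,
                              const std::vector<int>& inStrides,
                              const std::vector<int>& outStrides) {
    int inIdx = 0;
    for (std::size_t i = 0; i < outStrides.size(); ++i) {
        const int coord = outIdx / outStrides[i];              // coordinate along dimension i
        outIdx %= outStrides[i];
        inIdx += (inShape[i] == 1 ? 0 : coord) * inStrides[i]; // size-1 dims are broadcast
    }
    return inIdx;
}
// Example: output dims {2,3} (strides {3,1}) and input dims {1,3} (strides {3,1}):
// output index 4 -> coordinates (1,1) -> input index 1. ---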
CHECK_CUDA_STATUS(cudaFree(d_input2_strides)); + CHECK_CUDA_STATUS(cudaFree(d_output_strides)); +}; + +template void Aidge::divForward<double>(const double* input1, double* output, const double* input2, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::divForward<float>(const float* input1, float* output, const float* input2, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::divForward<half>(const half* input1, half* output, const half* input2, + const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, + const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, + int outSize); diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 9948ee1356ad4fedb5d830016ae66ca69a033e38..1a7bb8edb51312d08467354e20723ad19176bfee 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -110,7 +110,7 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co output, n)); - cudaFree(onesVector); + CHECK_CUDA_STATUS(cudaFree(onesVector)); } } @@ -125,7 +125,7 @@ void Aidge::FCImpl_cuda::backward() { const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)); + const auto& input2 = (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: @@ -156,10 +156,9 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c // Performing weightsGrad = (input) * T(outputGrad) // [n x m] = [n x k] * [k x m] - int m = input0.dims()[input0.nbDims()-1]; + int m = input1.dims()[1]; int k = input0.size()/m; int n = input1.size()/m; - int input0LastDim = input0.dims()[input0.nbDims()-1]; CHECK_CUBLAS_STATUS(cublasGemm( CudaContext::cublasHandle(), CUBLAS_OP_N, @@ -190,7 +189,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(), CUBLAS_OP_N, outChannels, - input0LastDim, + k, reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), outputGrad, outChannels, @@ -199,7 +198,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), biasGrad, 1)); - cudaFree(onesVector); + CHECK_CUDA_STATUS(cudaFree(onesVector)); } // Performing inputGrad = (weights) * (outputGrad) CHECK_CUBLAS_STATUS(cublasGemm( @@ -207,7 +206,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c CUBLAS_OP_N, CUBLAS_OP_N, op.getInput(1)->grad()->size()/outChannels, - input0LastDim, + k, outChannels, reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), weights,//w diff --git a/src/operator/ILayerNormImpl.cpp b/src/operator/ILayerNormImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47dd1d5d1a3f127c9e08788f605796020a7814a7 --- /dev/null +++ b/src/operator/ILayerNormImpl.cpp @@ -0,0 +1,204 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 10.09.2024 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> +#include <algorithm> // For std::max +#include <cmath> // For pow +#include <typeinfo> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" +#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/ILayerNorm.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ILayerNormImpl_cuda::forward() { + + + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(mOp.getRawInput(0) && "missing input #0"); + assert(mOp.getRawInput(1) && "missing input #1"); + assert(mOp.getRawInput(2) && "missing input #2"); + + const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + + +template<class T> +void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) +{ + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * input_raw = static_cast<const T*>(input0.getImpl()->rawPtr()); + const T * weight = static_cast<const T*>(input1.getImpl()->rawPtr()); + const T * bias = static_cast<const T*>(input2.getImpl()->rawPtr()); + T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + + int N = 15; + int output_bits = 8; + size_t size = input0.size(); + std::vector<DimSize_t> dims_input = input0.dims(); + + // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + + double min = std::numeric_limits<double>::max(); + double max = std::numeric_limits<double>::min(); + for(std::size_t i = 0; i < dims_input[0]; i++) { + for(std::size_t j = 0; j < dims_input[1]; j++) { + for(std::size_t k = 0; k < dims_input[2]; k++) { + for(std::size_t l = 0; l < dims_input[3]; l++) { + std::vector<std::size_t> coordIdx = {i, j, k, l}; + std::size_t newFlatIdx = input0.getIdx(coordIdx); + if (newFlatIdx < min) { + min = newFlatIdx; + } + if (newFlatIdx > max) { + max = newFlatIdx; + } + } + } + } + } + double m = std::max(std::abs(min), std::abs(max)); + double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor (not used here) + // double new_SF = 1/std::pow(2,2*output_bits-1); + + ILayerNormforward(input_raw, output, scaling_factor, weight, bias, size, dims_input); +} + +void Aidge::ILayerNormImpl_cuda::backward() { + const OperatorTensor& op = 
static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + size_t size = output_grad.size(); + std::vector<DimSize_t> dims_input = output_grad.dims(); + + const T * output = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); + + T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T * weight_grad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); + T * bias_grad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); + + const T * input = static_cast<const T*>(op.getInput(0)->getImpl()->rawPtr()); + const T * weight = static_cast<const T*>(op.getInput(1)->getImpl()->rawPtr()); + const T * bias = static_cast<const T*>(op.getInput(2)->getImpl()->rawPtr()); + + // maybe find a most efficient way to compute mean and variance tensor + + std::vector<std::vector<std::vector<std::vector<T>>>> means(dims_input[0], + std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<T>>(dims_input[2], + std::vector<T>(dims_input[3], 0.0f)))); + + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + T sum = 0.0f; + + for (std::size_t l = 0; l < dims_input[3]; l++) { + std::vector<std::size_t> coordIdx = {i, j, k, l}; + sum += output_grad.getIdx(coordIdx); + } + for (std::size_t l = 0; l < dims_input[3]; l++) { + std::vector<std::size_t> coordIdx = {i, j, k, l}; + means[i][j][k][l] = sum / static_cast<T>(dims_input[3]); + } + } + } + } + std::vector<T> flat_means; + + for (const auto &vec3d : means) { + for (const auto &vec2d : vec3d) { + for (const auto &vec1d : vec2d) { + flat_means.insert(flat_means.end(), vec1d.begin(), vec1d.end()); + } + } + } + + std::vector<std::vector<std::vector<std::vector<T>>>> vars(dims_input[0], + std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<T>>(dims_input[2], + std::vector<T>(dims_input[3], 0.0f)))); + + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + T sum_sq_diff = 0.0f; + + for (std::size_t l = 0; l < dims_input[3]; l++) { + std::vector<std::size_t> coordIdx = {i, j, k, l}; + T value = static_cast<T>(output_grad.getIdx(coordIdx)); + T diff = value - means[i][j][k][l]; + sum_sq_diff += diff * diff; + } + T variance = sum_sq_diff / static_cast<T>(dims_input[3]); + for (std::size_t l = 0; l < dims_input[3]; l++) { + vars[i][j][k][l] = variance; + } + } + } + } + + std::vector<T> flat_vars; + + for (const auto &vec3d : vars) { + for (const auto &vec2d : vec3d) { + for (const auto &vec1d : vec2d) { + flat_vars.insert(flat_vars.end(), vec1d.begin(), vec1d.end()); + } + } + } + + const T* mean_ = flat_means.data(); + const T* var_ = flat_vars.data(); + const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); + + ILayerNormbackward(output, output_grad_raw, input, mean_, var_, weight, bias, input_grad, weight_grad, bias_grad, 
size); +} diff --git a/src/operator/ILayerNormImpl_CUDA_kernels.cu b/src/operator/ILayerNormImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..fafdc176fdad6a6130c9bc4374d75f8a773f2c16 --- /dev/null +++ b/src/operator/ILayerNormImpl_CUDA_kernels.cu @@ -0,0 +1,335 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 10.09.2024 + * + ********************************************************************************/ + +#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) +#define CLAMP(X) (((X) < (0)) ? (0) : (X)) + +#include <stdio.h> +#include <cuda_runtime.h> + +#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp" + +namespace Aidge{ + +template <class T> +__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor,long long int* square_tensor, T* weight, T* biase, double new_SF) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; + + int k = 1 << 16; + long long int sum = 0; + if (x < dims[0] && y < dims[1] && z < dims[2]) { + int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3]; + int val; + int mean_val = 0; + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + val = roundf(input[idx] / SF); + quantized_tensor[idx] = val; + mean_val += val; + } + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + quantized_tensor[idx] -= (mean_val/dims[3]) ; + square_tensor[idx] = (quantized_tensor[idx] * quantized_tensor[idx]); // I-ViT code implementation + //square_tensor[idx] = (quantized_tensor[idx] * quantized_tensor[idx])/dims[3]; // I-ViT paper implementation + } + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + sum += square_tensor[idx]; + biase[i] = (biase[i]/weight[i])/new_SF; + weight[i] = weight[i] * new_SF; + } + for(int h = 0; h < 10 ; h++) + { + k = floorf((k + floorf(sum / k))/2); + } + int factor = (((1 << 31) - 1) / k); + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + square_tensor[idx] = (biase[idx]/weight[idx])/new_SF; + quantized_tensor[idx] = (quantized_tensor[idx] * factor / 2) + biase[maxIdx]; + input[idx] = quantized_tensor[idx] * new_SF; + } + + } +} + +template <> +void ILayerNormforward<float>(const float* input, float* output, double SF, const float* weight_raw, const float* bias_raw, size_t size, std::vector<long unsigned int> dims_input) +{ + int dims_input_cuda[4] = {1, 1, 1, 1}; + for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + + double new_SF = std::sqrt(dims_input_cuda[3]) / (1 << 30); + + float* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(float)); + cudaMemcpy(input_cuda_tensor,input, size * sizeof(float),cudaMemcpyHostToDevice); + + int *quantized_tensor; + cudaMalloc(&quantized_tensor, size * sizeof(int)); + + int *dims; + cudaMalloc(&dims, 4 * sizeof(int)); + cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + float *weight; + cudaMalloc(&weight,dims_input_cuda[3]*sizeof(float)); + 
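// --- Illustrative aside, not part of the patch: the `for(int h = 0; h < 10; h++)`
// loop inside ILayerNormforward_ above is an integer Newton iteration that
// approximates sqrt(sum) without floating point, in the spirit of the I-ViT
// integer-only LayerNorm referenced in the kernel comments. A host-side sketch
// of the same iteration (seed 1 << 16 and 10 steps as in the kernel; this is an
// approximation, not an exact integer square root):
static long long integerSqrtNewton(long long sum) {
    long long k = 1LL << 16;           // initial guess, as in the kernel
    for (int it = 0; it < 10; ++it) {
        k = (k + sum / k) / 2;         // Newton step: k <- (k + sum / k) / 2
    }
    return k;                          // ~ sqrt(sum) for the magnitudes accumulated above
}
// --- End of aside.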
cudaMemcpy(weight,weight_raw,dims_input_cuda[3]*sizeof(float),cudaMemcpyHostToDevice); + + float *bias; + cudaMalloc(&bias,dims_input_cuda[3]*sizeof(float)); + cudaMemcpy(bias,bias_raw,dims_input_cuda[3]*sizeof(float),cudaMemcpyHostToDevice); + + long long int* Squaretensor; + cudaMalloc(&Squaretensor,(size)*sizeof(long long int)); + + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks((dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z); + + ILayerNormforward_<float><<<numBlocks,threadsPerBlock>>>(input_cuda_tensor,SF,dims,quantized_tensor,Squaretensor,weight,bias,new_SF); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + } + cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(float), cudaMemcpyDeviceToHost); + + + cudaFree(input_cuda_tensor); + cudaFree(weight); + cudaFree(bias); + cudaFree(dims); + cudaFree(quantized_tensor); +} + +template <> +void ILayerNormforward<double>(const double* input, double* output, double SF, const double* weight_raw, const double* bias_raw, size_t size, std::vector<long unsigned int> dims_input) +{ + int dims_input_cuda[4] = {1, 1, 1, 1}; + for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + + double new_SF = std::sqrt(dims_input_cuda[3]) / (1 << 30); + + double* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(double)); + cudaMemcpy(input_cuda_tensor,input, size * sizeof(double),cudaMemcpyHostToDevice); + + int *quantized_tensor; + cudaMalloc(&quantized_tensor, size * sizeof(int)); + + int *dims; + cudaMalloc(&dims, 4 * sizeof(int)); + cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + double *weight; + cudaMalloc(&weight,dims_input_cuda[3]*sizeof(double)); + cudaMemcpy(weight,weight_raw,dims_input_cuda[3]*sizeof(double),cudaMemcpyHostToDevice); + + double *bias; + cudaMalloc(&bias,dims_input_cuda[3]*sizeof(double)); + cudaMemcpy(bias,bias_raw,dims_input_cuda[3]*sizeof(double),cudaMemcpyHostToDevice); + + long long int* Squaretensor; + cudaMalloc(&Squaretensor,(size)*sizeof(long long int)); + + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks((dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z); + + ILayerNormforward_<double><<<numBlocks,threadsPerBlock>>>(input_cuda_tensor,SF,dims,quantized_tensor,Squaretensor,weight,bias,new_SF); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + } + + cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaFree(input_cuda_tensor); + cudaFree(weight); + cudaFree(bias); + cudaFree(dims); + cudaFree(quantized_tensor); +} + +template <class T> +__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + T d_norm = output_grad[i] * weight[i]; + T d_var = d_norm * (input_tensor[i] - mean[i]) * -0.5 * powf(var[i] + 1e-6, -1.5); + T d_mean = 
d_norm * -1 / sqrtf(var[i] + 1e-6) + d_var * -2 * mean[i] / size; + T d_input = d_norm / sqrtf(var[i] + 1e-6) + d_var * 2 * (input_tensor[i] - mean[i]) / size + d_mean / size; + + input_grad[i] = d_input; + weight_grad[i] = output_grad[i] * output_tensor[i]; + bias_grad[i] = output_grad[i]; + } +} + +template <> +void ILayerNormbackward<float>(const float* input_tensor, const float* output_grad, const float* output_tensor,const float* mean,const float* var, const float* weight, const float* bias, float* input_grad, float* weight_grad, float* bias_grad, size_t size) +{ + float* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(float)); + cudaMemcpy(input_cuda_tensor,input_tensor,size*sizeof(float),cudaMemcpyHostToDevice); + + float* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(float)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice); + + float* output_tensor_; + cudaMalloc(&output_tensor_,size*sizeof(float)); + cudaMemcpy(output_tensor_,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice); + + float* mean_; + cudaMalloc(&mean_,size*sizeof(float)); + cudaMemcpy(mean_,mean,size*sizeof(float),cudaMemcpyHostToDevice); + + float* var_; + cudaMalloc(&var_,size*sizeof(float)); + cudaMemcpy(var_,var,size*sizeof(float),cudaMemcpyHostToDevice); + + float* weight_; + cudaMalloc(&weight_,size*sizeof(float)); + cudaMemcpy(weight_,weight,size*sizeof(float),cudaMemcpyHostToDevice); + + float* bias_; + cudaMalloc(&bias_,size*sizeof(float)); + cudaMemcpy(bias_,bias,size*sizeof(float),cudaMemcpyHostToDevice); + + + float* input_grad_; + cudaMalloc(&input_grad_,size*sizeof(float)); + + float* weight_grad_; + cudaMalloc(&weight_grad_,size*sizeof(float)); + + float* bias_grad_; + cudaMalloc(&bias_grad_,size*sizeof(float)); + + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ILayerNormbackward_<<<Blocks,threadParBlock>>>(output_grad_,input_cuda_tensor,output_tensor_,mean_,var_,weight_,bias_,input_grad_, weight_grad_, bias_grad_, size); + + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + cudaMemcpy(input_grad , input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(weight_grad , weight_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(bias_grad , bias_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(input_cuda_tensor); + cudaFree(output_grad_); + cudaFree(mean_); + cudaFree(var_); + cudaFree(weight_); + cudaFree(bias_); + cudaFree(input_grad_); + cudaFree(weight_grad_); + cudaFree(bias_grad_); + +} + +template <> +void ILayerNormbackward<double>(const double* input_tensor, const double* output_grad, const double* output_tensor,const double* mean,const double* var, const double* weight, const double* bias, double* input_grad, double* weight_grad, double* bias_grad, size_t size) +{ + double* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(double)); + cudaMemcpy(input_cuda_tensor,input_tensor,size*sizeof(double),cudaMemcpyHostToDevice); + + double* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(double)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice); + + double* output_tensor_; + cudaMalloc(&output_tensor_,size*sizeof(double)); + cudaMemcpy(output_tensor_,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice); + + double* mean_; + cudaMalloc(&mean_,size*sizeof(double)); + 
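// --- Illustrative aside, not part of the patch: per element, ILayerNormbackward_
// above applies the LayerNorm chain rule as written in the kernel:
//   d_norm = dy * w
//   d_var  = d_norm * (x - mean) * (-1/2) * (var + eps)^(-3/2)
//   d_mean = -d_norm / sqrt(var + eps) + d_var * (-2 * mean) / N
//   dx     = d_norm / sqrt(var + eps) + d_var * 2 * (x - mean) / N + d_mean / N
// with eps = 1e-6 and N = size. A scalar host reference mirroring that arithmetic
// (float version; it reproduces the kernel as written, including its simplifications):
#include <cmath>
static float layerNormInputGradRef(float dy, float x, float mean, float var,
                                   float w, int N, float eps = 1e-6f) {
    const float d_norm = dy * w;
    const float d_var  = d_norm * (x - mean) * -0.5f * std::pow(var + eps, -1.5f);
    const float d_mean = -d_norm / std::sqrt(var + eps) + d_var * -2.0f * mean / N;
    return d_norm / std::sqrt(var + eps) + d_var * 2.0f * (x - mean) / N + d_mean / N;
}
// --- End of aside.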
cudaMemcpy(mean_,mean,size*sizeof(double),cudaMemcpyHostToDevice); + + double* var_; + cudaMalloc(&var_,size*sizeof(double)); + cudaMemcpy(var_,var,size*sizeof(double),cudaMemcpyHostToDevice); + + double* weight_; + cudaMalloc(&weight_,size*sizeof(double)); + cudaMemcpy(weight_,weight,size*sizeof(double),cudaMemcpyHostToDevice); + + double* bias_; + cudaMalloc(&bias_,size*sizeof(double)); + cudaMemcpy(bias_,bias,size*sizeof(double),cudaMemcpyHostToDevice); + + + double* input_grad_; + cudaMalloc(&input_grad_,size*sizeof(double)); + + double* weight_grad_; + cudaMalloc(&weight_grad_,size*sizeof(double)); + + double* bias_grad_; + cudaMalloc(&bias_grad_,size*sizeof(double)); + + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ILayerNormbackward_<<<Blocks,threadParBlock>>>(output_grad_,input_cuda_tensor,output_tensor_,mean_,var_,weight_,bias_,input_grad_, weight_grad_, bias_grad_, size); + + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + + cudaMemcpy(input_grad , input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(weight_grad , weight_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(bias_grad , bias_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaFree(input_cuda_tensor); + cudaFree(output_grad_); + cudaFree(mean_); + cudaFree(var_); + cudaFree(weight_); + cudaFree(bias_); + cudaFree(input_grad_); + cudaFree(weight_grad_); + cudaFree(bias_grad_); +} + +} \ No newline at end of file diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed09ed45f5006c3760376a9d6f44f29d05bcfabe --- /dev/null +++ b/src/operator/LnImpl.cpp @@ -0,0 +1,80 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/LnImpl.hpp" +#include "aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Ln.hpp" +#include "aidge/utils/Types.h" + +void Aidge::LnImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::LnImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + + Aidge::lnForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); +} + +void Aidge::LnImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + switch(op.getInput(0)->grad()->dataType()) { + case DataType::Float64: + backward_<double>(output_grad); + break; + case DataType::Float32: + backward_<float>(output_grad); + break; + case DataType::Float16: + backward_<half>(output_grad); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::LnImpl_cuda::backward_(const Tensor& output_grad) { + //TODO +} diff --git a/src/operator/LnImpl_CUDA_kernels.cu b/src/operator/LnImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..21521eaa9dbe7f7bb0664a4fce71c67979f735ad --- /dev/null +++ b/src/operator/LnImpl_CUDA_kernels.cu @@ -0,0 +1,48 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp" +// Base template for floating-point types (float, double) +template<typename T> +__device__ T ln_helper(T x) { + return std::log(x); // std::log works for both float and double +} + +// Specialization for half-precision type using CUDA's half +template<> +__device__ half ln_helper<half>(half x) { + float x_float = __half2float(x); // Convert __half to float + return __float2half(std::log(x_float)); // Compute log and convert back to half +} + +template <class T> +__global__ void lnKernel(const T* input, T* output, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + + output[idx] = ln_helper(input[idx]); +} + +template <class T> +void Aidge::lnForward(const T* input, T* output, int size) +{ + int blockSize = 256; + int numBlocks = (size + blockSize - 1) / blockSize; + + // Launch the kernel + lnKernel<<<numBlocks, blockSize>>>(input, output, size); +}; + +template void Aidge::lnForward<double>(const double* input, double* output, int size); + +template void Aidge::lnForward<float>(const float* input, float* output, int size); + +template void Aidge::lnForward<half>(const half* input, half* output, int size); \ No newline at end of file diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af87251e8f29eded7d24cca2f08b880557ebb482 --- /dev/null +++ b/src/operator/MulImpl.cpp @@ -0,0 +1,221 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> +#include <chrono> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/MulImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Mul.hpp" +#include "aidge/utils/Types.h" + +void Aidge::MulImpl_cuda::forward() { + const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Mul operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Mul forward because the 0-th input has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in Mul operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Mul forward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Mul inputs with two differents data type."); + } + + std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); + std::vector<Tensor> inputs(op.nbInputs()); + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + // Create a Tensor descriptor with the broadcasted dims and strides + cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), 
inputsStrides[1].data())); + // Multiply inputs + cudnnOpTensorDescriptor_t opTensorDesc; + CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); + CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); + if(inputs[0].size()>inputs[1].size()) { + CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + } + else { + CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + } + + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); + CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); +} + +void Aidge::MulImpl_cuda::backward() { + const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + // Check output + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Mul operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Mul backward because the output gradient has no implementation."); + + std::shared_ptr<Tensor> outputGradFallback; + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + std::shared_ptr<Tensor> inputFallback; + const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + + // Create a 
Tensor descriptor with the broadcasted dims and strides + cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); + + // Create the operation descriptor + cudnnOpTensorDescriptor_t opTensorDesc; + CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); + CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); + + // Input0_grad = output_grad * Input1 + CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc1, + op.getInput(1)->getImpl()->rawPtr(), + &beta, + tensorDesc0, + op.getInput(0)->grad()->getImpl()->rawPtr())); + + // Input1_grad = output_grad * Input0 + CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc0, + op.getInput(0)->getImpl()->rawPtr(), + &beta, + tensorDesc1, + op.getInput(1)->grad()->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); + CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); +} \ No newline at end of file diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..84af8c2a74c8ebaeb7d7380975089086e4db31da --- /dev/null +++ b/src/operator/PowImpl.cpp @@ -0,0 +1,113 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/PowImpl.hpp" +#include "aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Pow.hpp" +#include "aidge/utils/Types.h" + +void Aidge::PowImpl_cuda::forward() { + const Pow_Op& op = static_cast<const Pow_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Pow forward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Pow inputs with two differents data type."); + } + + std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); + std::vector<Tensor> inputs(op.nbInputs()); + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::PowImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); + const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); + if(op.getOutput(0)->nbDims()>1) { + for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { + outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + } + } + std::vector<int> 
outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1); + for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { + outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); + } + + Aidge::powForward<T>(input1Ptr, outputPtr, input2Ptr, + inputsDims[0], inputsDims[1], outDims, + inputsStrides[0], inputsStrides[1], outputStrides, + static_cast<int>(op.getOutput(0)->size())); +} + +void Aidge::PowImpl_cuda::backward() { + // TODO +} + +template <class T> +void Aidge::PowImpl_cuda::backward_(const Tensor& outGrad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + // TODO +} \ No newline at end of file diff --git a/src/operator/PowImpl_CUDA_kernels.cu b/src/operator/PowImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..acd03b21764dccdfd3c5bc279e255cd0b692537e --- /dev/null +++ b/src/operator/PowImpl_CUDA_kernels.cu @@ -0,0 +1,99 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cuda_fp16.h> + +#include "aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp" + +// Helper function for pow +template <typename T> +__device__ T pow(T x, T exponent) { + return std::pow(x, exponent); +} +template <> +__device__ half pow<half>(half x, half exponent) { + return __float2half(powf(__half2float(x), __half2float(exponent))); +} + +template <class T> +__global__ void pow_kernel(const T* input, T* output, const T* exponent, + int* input_shape, int* exponent_shape, int* output_shape, + int* input_strides, int* exponent_strides, int* output_strides, + int num_dims, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + + int input_idx = 0, exponent_idx = 0; + int temp_idx = idx; + for (int i = 0; i < num_dims; ++i) { + int dim = temp_idx / output_strides[i]; + temp_idx %= output_strides[i]; + input_idx += (input_shape[i] == 1 ? 0 : dim) * input_strides[i]; + exponent_idx += (exponent_shape[i] == 1 ? 
0 : dim) * exponent_strides[i]; + } + + output[idx] = pow(input[input_idx], exponent[exponent_idx]); +} + + +template <class T> +void Aidge::powForward<T>(const T* input, T* output, const T* exponent, + const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, + const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, + int outSize) +{ + int *d_input_strides, *d_exponent_strides, *d_output_strides, *d_input_shape, *d_exponent_shape, *d_output_shape; + // Allocate device memory + CHECK_CUDA_STATUS(cudaMalloc(&d_input_shape, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_exponent_shape, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_output_shape, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_input_strides, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_exponent_strides, inputDims.size() * sizeof(int))); + CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, inputDims.size() * sizeof(int))); + + // Copy data from host to device; + CHECK_CUDA_STATUS(cudaMemcpy(d_input_shape, inputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_exponent_shape, exponentDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_output_shape, outputDims.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_input_strides, inputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_exponent_strides, exponentStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), inputDims.size() * sizeof(int), cudaMemcpyHostToDevice)); + int blockSize = 256; + int numBlocks = (outSize + blockSize - 1) / blockSize; + + int num_dims = inputDims.size(); + // Launch the kernel + pow_kernel<<<numBlocks, blockSize>>>(input, output, exponent, + d_input_shape, d_exponent_shape, d_output_shape, + d_input_strides, d_exponent_strides, d_output_strides, + num_dims, outSize); + CHECK_CUDA_STATUS(cudaFree(d_input_shape)); + CHECK_CUDA_STATUS(cudaFree(d_exponent_shape)); + CHECK_CUDA_STATUS(cudaFree(d_output_shape)); + CHECK_CUDA_STATUS(cudaFree(d_input_strides)); + CHECK_CUDA_STATUS(cudaFree(d_exponent_strides)); + CHECK_CUDA_STATUS(cudaFree(d_output_strides)); +}; + +template void Aidge::powForward<double>(const double* input, double* output, const double* exponent, + const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, + const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::powForward<float>(const float* input, float* output, const float* exponent, + const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, + const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, + int outSize); + +template void Aidge::powForward<half>(const half* input, half* output, const half* exponent, + const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, + const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, + int 
outSize); diff --git a/src/operator/ReduceImpl_CUDA_kernels.cu b/src/operator/ReduceImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..7002e34116d2c1050987dc0cb93dbf7339a7ea93 --- /dev/null +++ b/src/operator/ReduceImpl_CUDA_kernels.cu @@ -0,0 +1,114 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" + +template <typename T> +__global__ void duplicateElements(const T* input, T* output, const std::size_t* shape, const std::size_t* new_shape, const int* axes, const std::size_t* factors, int num_dims, int num_axes) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int input_size = 1; + int output_size = 1; + + for (int i = 0; i < num_dims; ++i) { + input_size *= shape[i]; + output_size *= new_shape[i]; + } + + if (idx >= output_size) return; + + int* out_idx = new int[num_dims]; + int* in_idx = new int[num_dims]; + int remaining_idx = idx; + + for (int i = num_dims - 1; i >= 0; --i) { + out_idx[i] = remaining_idx % new_shape[i]; + remaining_idx /= new_shape[i]; + } + + for (int i = 0; i < num_dims; ++i) { + in_idx[i] = out_idx[i]; + } + + for (int i = 0; i < num_axes; ++i) { + int axis = axes[i]; + int factor = factors[i]; + in_idx[axis] = out_idx[axis] / factor; + } + + int in_linear_idx = 0; + int out_linear_idx = 0; + int input_stride = 1; + int output_stride = 1; + + for (int i = num_dims - 1; i >= 0; --i) { + in_linear_idx += in_idx[i] * input_stride; + out_linear_idx += out_idx[i] * output_stride; + input_stride *= shape[i]; + output_stride *= new_shape[i]; + } + + output[out_linear_idx] = input[in_linear_idx]; + + delete[] out_idx; + delete[] in_idx; +} + +template <typename T> +void Aidge::ReduceBackward(const T* input, T* output, const std::vector<std::size_t>& inputDims, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, int outSize) { + + std::size_t* d_shape; + std::size_t* d_new_shape; + int* d_axes; + std::size_t* d_factors; + cudaMalloc(&d_shape, inputDims.size() * sizeof(std::size_t)); + cudaMalloc(&d_new_shape, outputDims.size() * sizeof(std::size_t)); + cudaMalloc(&d_axes, axes.size() * sizeof(int)); + cudaMalloc(&d_factors, axes.size() * sizeof(std::size_t)); + + cudaMemcpy(d_shape, inputDims.data(), inputDims.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_new_shape, outputDims.data(), outputDims.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_axes, axes.data(), axes.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_factors, factors.data(), axes.size() * sizeof(std::size_t), cudaMemcpyHostToDevice); + + int blockSize = 256; + int numBlocks = (outSize + blockSize - 1) / blockSize; + + duplicateElements<<<numBlocks, blockSize>>>(input, output, d_shape, d_new_shape, d_axes, d_factors, static_cast<int>(inputDims.size()), static_cast<int>(axes.size())); + cudaFree(d_shape); + cudaFree(d_new_shape); + cudaFree(d_axes); + cudaFree(d_factors); +} + + +template void Aidge::ReduceBackward(const double* input, + double* output, + const std::vector<std::size_t>& inputDims, + 
const std::vector<std::size_t>& outputDims, + const std::vector<int>& axes, + const std::vector<std::size_t>& factors, + int outSize); + +template void Aidge::ReduceBackward(const float* input, + float* output, + const std::vector<std::size_t>& inputDims, + const std::vector<std::size_t>& outputDims, + const std::vector<int>& axes, + const std::vector<std::size_t>& factors, + int outSize); + +template void Aidge::ReduceBackward(const half* input, + half* output, + const std::vector<std::size_t>& inputDims, + const std::vector<std::size_t>& outputDims, + const std::vector<int>& axes, + const std::vector<std::size_t>& factors, + int outSize); diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ff83ea5153a95e109ce7ef83c42ed4d672561ad1 --- /dev/null +++ b/src/operator/ReduceMeanImpl.cpp @@ -0,0 +1,200 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/ReduceMean.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ReduceMeanImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getInput(0), "missing input in ReduceMean operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceMean forward because the input has no implementation."); + + const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + + const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); + bool keepDims = rmOp.keepDims(); + auto axes = rmOp.axes(); + if (axes.empty()) { + input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); + } + else { + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } + } +} + + +template <class T> +void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + cudnnReduceTensorDescriptor_t reduceDesc; + cudnnTensorDescriptor_t outputDesc; + if (keepDims) { + outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + 
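+        // cuDNN reduction workflow: configure an AVG reduce-tensor descriptor,
+        // query the workspace size needed to reduce the input descriptor into
+        // outputDesc, allocate that workspace, then call cudnnReduceTensor with
+        // the alpha/beta scaling factors to write the mean into the output tensor.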
CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + + size_t workspaceSize; + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); + + void *d_workspace; + CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); + + CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); + } + else { + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); + std::vector<int> outputDims; + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); + for (const auto axis:axes) { + outputDims[axis] = 1; + } + if (outputDims.size() < 4) { + outputDims.resize(4, 1); + } + // Compute the corresponding strides + std::vector<int> outputStrides(outputDims.size()); + int product = 1; + for (size_t i = outputDims.size(); i > 0; --i) { + outputStrides[i - 1] = product; + product *= outputDims[i - 1]; + } + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); + + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + + size_t workspaceSize; + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); + + void *d_workspace; + CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); + + CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); + } +} + +void Aidge::ReduceMeanImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceMean operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceMean backward because the output grad has no implementation."); + + const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); + + const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); + auto axes = rmOp.axes(); + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + 
break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + + std::vector<std::size_t> factors; + for (auto axis:axes) { + factors.push_back(op.getInput(0)->grad()->dims()[axis]); + } + + Aidge::ReduceBackward(outputGrad, + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); +} diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..895584d87dab88f3f71a424a02a3b32954c4dc43 --- /dev/null +++ b/src/operator/ReduceSumImpl.cpp @@ -0,0 +1,199 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/ReduceSum.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ReduceSumImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getInput(0), "missing input in ReduceSum operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceSum forward because the input has no implementation."); + + const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + + const ReduceSum_Op& rsOp = static_cast<const ReduceSum_Op&>(mOp); + bool keepDims = rsOp.keepDims(); + auto axes = rsOp.axes(); + if (axes.empty()) { + input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); + } + else { + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } + } +} + + +template <class T> +void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { + const OperatorTensor& op = static_cast<const 
OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + cudnnReduceTensorDescriptor_t reduceDesc; + cudnnTensorDescriptor_t outputDesc; + if (keepDims) { + outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + + size_t workspaceSize; + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); + + void *d_workspace; + CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); + + CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); + } + else { + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); + std::vector<int> outputDims; + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); + for (const auto axis:axes) { + outputDims[axis] = 1; + } + if (outputDims.size() < 4) { + outputDims.resize(4, 1); + } + // Compute the corresponding strides + std::vector<int> outputStrides(outputDims.size()); + int product = 1; + for (size_t i = outputDims.size(); i > 0; --i) { + outputStrides[i - 1] = product; + product *= outputDims[i - 1]; + } + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); + + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + + size_t workspaceSize; + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); + + void *d_workspace; + CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); + + CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); + } +} + +void Aidge::ReduceSumImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceSum operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceSum backward because the output grad 
has no implementation."); + + const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); + + const ReduceSum_Op& rmOp = static_cast<const ReduceSum_Op&>(mOp); + auto axes = rmOp.axes(); + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + + std::vector<std::size_t> factors; + for (auto axis:axes) { + factors.push_back(op.getInput(0)->grad()->dims()[axis]); + } + + Aidge::ReduceBackward(outputGrad, + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); +} diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp index 8016a5a9d1dfc26454af2cb03b6fe573820245f5..783e244057b0fc42a782fd363c3a99aa6d73b46b 100644 --- a/src/operator/ReshapeImpl.cpp +++ b/src/operator/ReshapeImpl.cpp @@ -39,8 +39,3 @@ void Aidge::ReshapeImpl_cuda::backward() { std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); } - -Aidge::ReshapeImpl_cuda::~ReshapeImpl_cuda() { - -} - diff --git a/src/operator/ShiftGELUImpl.cpp b/src/operator/ShiftGELUImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c2774804d04a422aefd0c66ed0d1fc1d949b1f06 --- /dev/null +++ b/src/operator/ShiftGELUImpl.cpp @@ -0,0 +1,119 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France
+ * Date: 25.06.2024
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>     // std::chrono::milliseconds
+#include <numeric>    // std::accumulate
+#include <thread>     // std::this_thread::sleep_for
+#include <vector>
+#include <algorithm>  // For std::max
+#include <cmath>      // For pow
+#include <limits>     // For std::numeric_limits
+#include <typeinfo>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
+#include "aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ShiftGELU.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ShiftGELUImpl_cuda::forward() {
+
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    assert(mOp.getRawInput(0) && "missing input #0");
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template<class T>
+void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input)
+{
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr());
+    T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    int N = 15;
+    int output_bits = 8;
+    size_t size = input.size();
+    std::vector<DimSize_t> dims_input = input.dims();
+
+    // TODO: this host-side scan of the whole tensor is costly; a dedicated min/max
+    // reduction would be a more efficient way to compute the scaling factor.
+    // The scan reads the tensor values through input_raw, relying on the same
+    // host-accessibility assumption already made by the ShiftGELUforward call below.
+    double min = std::numeric_limits<double>::max();
+    double max = std::numeric_limits<double>::lowest();
+    for(std::size_t i = 0; i < dims_input[0]; i++) {
+        for(std::size_t j = 0; j < dims_input[1]; j++) {
+            for(std::size_t k = 0; k < dims_input[2]; k++) {
+                for(std::size_t l = 0; l < dims_input[3]; l++) {
+                    std::vector<std::size_t> coordIdx = {i, j, k, l};
+                    std::size_t newFlatIdx = input.getIdx(coordIdx);
+                    const double value = static_cast<double>(input_raw[newFlatIdx]);
+                    if (value < min) {
+                        min = value;
+                    }
+                    if (value > max) {
+                        max = value;
+                    }
+                }
+            }
+        }
+    }
+
+    double m = std::max(std::abs(min), std::abs(max));
+    double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1;
+    double scaling_factor = m / normalization_factor;
+
+    // The new scaling factor that could be used to dequantize the returned tensor (not used here)
+    // double new_SF = 1/std::pow(2,2*output_bits-1);
+
+    ShiftGELUforward(input_raw, output, scaling_factor, N, output_bits, size, dims_input);
+}
+
+void Aidge::ShiftGELUImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const
OperatorTensor&>(mOp); + const T * input = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); + + size_t size = output_grad.size(); + + T * output = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + + const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); + ShiftGELUbackward(input, output_grad_raw, output, size); + +} \ No newline at end of file diff --git a/src/operator/ShiftGELUImpl_CUDA_kernels.cu b/src/operator/ShiftGELUImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..aabd89c04e960f9f19eca69247173168d3eaf71e --- /dev/null +++ b/src/operator/ShiftGELUImpl_CUDA_kernels.cu @@ -0,0 +1,256 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ +#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) +#define CLAMP(X) (((X) < (0)) ? (0) : (X)) + +#include <stdio.h> +#include <cuda_runtime.h> + +#include "aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp" + +__device__ inline int ExpShift(int I,int N, double SF) +{ + int Ip = I + (I >> 1) - (I >> 4); + int I0 = floorf(-1.0/SF); + Ip = MAX(Ip,N*I0); + int q = floorf(Ip / (I0)); + int r = Ip -(I0*q); + int Ib = r/2 - I0; + Ib = CLAMP(Ib * powf(2,N-q)); + return (int)Ib; +} + +namespace Aidge{ + +template <class T> +__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits) { + + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; + + double SF_sig = SF * 1.702; + double Final_SF = SF / powf(2,(output_bits-1)); + + if (x < dims[0] && y < dims[1] && z < dims[2]) { + int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3]; + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + quantized_tensor[idx] = roundf(input[idx] / SF); + } + int maxVal = quantized_tensor[maxIdx]; + for (int i = 1; i < dims[3]; i++) { + int idx = maxIdx + i; + maxVal = MAX(maxVal, quantized_tensor[idx]); + } + int Max_Exp = ExpShift(-maxVal,N,SF_sig); + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + GELUtensor[idx] = ExpShift(quantized_tensor[idx] - maxVal,N,SF_sig); + if(GELUtensor[idx] > INT_MAX - Max_Exp) { + SumTensor[idx] = 1; + } + else + { + SumTensor[idx] = floorf(INT_MAX/(GELUtensor[idx] + Max_Exp)); + } + SumTensor[idx] = floorf((GELUtensor[idx] * SumTensor[idx]) >> (31 - output_bits + 1)); + quantized_tensor[idx] *= SumTensor[idx]; + input[idx] = quantized_tensor[idx] * Final_SF; + } + } +} + +template <> +void ShiftGELUforward<float>(const float* input, float* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { + + double new_SF = 1/std::pow(2,2*output_bits-1); + + int dims_input_cuda[4]; + if (dims_input.size() >= 4) { + for (std::size_t i = 0; i < 4; ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + } + + float* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(float)); + 
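+    // Device-side pipeline implemented by ShiftGELUforward_: quantize the input with
+    // the scaling factor SF, run the shift-based integer approximation of the
+    // sigmoid/GELU (ExpShift), renormalise with an integer reciprocal, and write the
+    // dequantized result back into input_cuda_tensor, which is copied to output below.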
cudaMemcpy(input_cuda_tensor,input,size*sizeof(float),cudaMemcpyHostToDevice); + + int* quantized_tensor; + cudaMalloc(&quantized_tensor,size*sizeof(int)); + + int* GELUtensor; + cudaMalloc(&GELUtensor,size*sizeof(int)); + + int* SumTensor; + cudaMalloc(&SumTensor,size*sizeof(int)); + + int* dims; + cudaMalloc(&dims,4*sizeof(int)); + + cudaMemcpy(dims,dims_input_cuda,4*sizeof(int),cudaMemcpyHostToDevice); + + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks((dims_input[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input[2] + threadsPerBlock.z - 1) / threadsPerBlock.z); + + ShiftGELUforward_<float><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor,GELUtensor,SumTensor, dims, SF,N,8); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + cudaMemcpy(output,input_cuda_tensor,size*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(quantized_tensor); + cudaFree(GELUtensor); + cudaFree(SumTensor); + cudaFree(dims); + cudaFree(input_cuda_tensor); +} + +template <> +void ShiftGELUforward<double>(const double* input, double* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { + + double new_SF = 1/std::pow(2,2*output_bits-1); + + int dims_input_cuda[4]; + if (dims_input.size() >= 4) { + for (std::size_t i = 0; i < 4; ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + } + + double* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor,size*sizeof(double)); + cudaMemcpy(input_cuda_tensor,input,size*sizeof(double),cudaMemcpyHostToDevice); + + int* quantized_tensor; + cudaMalloc(&quantized_tensor,size*sizeof(int)); + + int* GELUtensor; + cudaMalloc(&GELUtensor,size*sizeof(int)); + + int* SumTensor; + cudaMalloc(&SumTensor,size*sizeof(int)); + + int* dims; + cudaMalloc(&dims,4*sizeof(int)); + + cudaMemcpy(dims,dims_input_cuda,4*sizeof(int),cudaMemcpyHostToDevice); + + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks((dims_input[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input[2] + threadsPerBlock.z - 1) / threadsPerBlock.z); + + ShiftGELUforward_<double><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor,GELUtensor,SumTensor, dims, SF,N,8); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + cudaMemcpy(output,input_cuda_tensor,size*sizeof(double),cudaMemcpyDeviceToHost); + + cudaFree(quantized_tensor); + cudaFree(GELUtensor); + cudaFree(SumTensor); + cudaFree(dims); + cudaFree(input_cuda_tensor); +} + +template <class T> +__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size) { + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + float x = output_tensor[index]; + float grad = output_grad[index]; + + float cdf = 0.5 * (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * pow(x, 3)))); + float pdf = exp(-0.5 * x * x) / sqrt(2.0 * M_PI); + float dx = pdf + x * cdf; + float backprop_grad = grad * dx; + input_grad[index] = backprop_grad; + } +} + +template <> +void ShiftGELUbackward<float>(const float* output_tensor, const float* output_grad, float* input_grad, size_t size) +{ + float* output_cuda_tensor; + cudaMalloc(&output_cuda_tensor,size*sizeof(float)); + 
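+    // The backward kernel does not differentiate the integer approximation: it
+    // rebuilds cdf and pdf of the float tanh-based GELU from the stored forward
+    // output and multiplies the incoming gradient by (pdf + x * cdf).
+    // For reference, the analytic derivative of x * Phi(x) is Phi(x) + x * phi(x).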
cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice); + + float* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(float)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice); + + float *input_grad_; + cudaMalloc(&input_grad_, size * sizeof(float)); + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ShiftGELUbackward_<float><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,size); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + cudaMemcpy(input_grad,input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(output_cuda_tensor); + cudaFree(input_grad_); + cudaFree(output_grad_); +} + +template <> +void ShiftGELUbackward<double>(const double* output_tensor, const double* output_grad, double* input_grad, size_t size) +{ + double* output_cuda_tensor; + cudaMalloc(&output_cuda_tensor,size*sizeof(double)); + cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice); + + double* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(double)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice); + + double *input_grad_; + cudaMalloc(&input_grad_, size * sizeof(double)); + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ShiftGELUbackward_<double><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,size); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + cudaMemcpy(input_grad,input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree(output_cuda_tensor); + cudaFree(input_grad_); + cudaFree(output_grad_); +} + +} \ No newline at end of file diff --git a/src/operator/ShiftMaxImpl.cpp b/src/operator/ShiftMaxImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1134cc5d6b99e53eb492c82e32d811bc0bcba0e0 --- /dev/null +++ b/src/operator/ShiftMaxImpl.cpp @@ -0,0 +1,121 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> +#include <algorithm> // For std::max +#include <cmath> // For pow +#include <typeinfo> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/ShiftMax.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ShiftMaxImpl_cuda::forward() { + + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + assert(mOp.getRawInput(0) && "missing input #0"); + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template<class T> +void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input) +{ + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr()); + T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + + int N = 15; + int output_bits = 8; + size_t size = input.size(); + std::vector<DimSize_t> dims_input = input.dims(); + + // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + + double min = std::numeric_limits<double>::max(); + double max = std::numeric_limits<double>::min(); + for(std::size_t i = 0; i < dims_input[0]; i++) { + for(std::size_t j = 0; j < dims_input[1]; j++) { + for(std::size_t k = 0; k < dims_input[2]; k++) { + for(std::size_t l = 0; l < dims_input[3]; l++) { + std::vector<std::size_t> coordIdx = {i, j, k, l}; + std::size_t newFlatIdx = input.getIdx(coordIdx); + if (newFlatIdx < min) { + min = newFlatIdx; + } + if (newFlatIdx > max) { + max = newFlatIdx; + } + } + } + } + } + + double m = std::max(std::abs(min), std::abs(max)); + double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor (not used here) + // double new_SF = 1/std::pow(2,2*output_bits-1); + + ShiftMaxforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input); +} + + +void Aidge::ShiftMaxImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const 
OperatorTensor&>(mOp); + const T * output_tensor = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); + + size_t size = output_grad.size(); + std::vector<DimSize_t> dims_output = output_grad.dims(); + + T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + + const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); + ShiftMaxbackward(output_tensor, output_grad_raw, input_grad, size, dims_output); + +} \ No newline at end of file diff --git a/src/operator/ShiftMaxImpl_CUDA_kernels.cu b/src/operator/ShiftMaxImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..ba3cfcb51e02fb0befbf9f7c1fc054e73a2a7157 --- /dev/null +++ b/src/operator/ShiftMaxImpl_CUDA_kernels.cu @@ -0,0 +1,286 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ +#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) +#define CLAMP(X) (((X) < (0)) ? (0) : (X)) + +#include <stdio.h> +#include <cuda_runtime.h> + +#include "aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp" + +__device__ inline int ExpShift(int I,int N, double SF) +{ + int Ip = I + (I >> 1) - (I >> 4); + int I0 = floorf(-1.0/SF); + Ip = MAX(Ip,N*I0); + int q = floorf(Ip / (I0)); + int r = Ip -(I0*q); + int Ib = r/2 - I0; + Ib = CLAMP(Ib * powf(2,N-q)); + return (int)Ib; +} + +namespace Aidge{ + +template <class T> +__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; + int sum = 0; + + if (x < dims[0] && y < dims[1] && z < dims[2]) { + int maxIdx = x * dims[1] * dims[2] * dims[3] + y * dims[2] * dims[3] + z * dims[3]; + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + quantized_tensor[idx] = roundf(input[idx] / SF); + } + int maxVal = quantized_tensor[maxIdx]; + for (int i = 1; i < dims[3]; i++) { + int idx = maxIdx + i; + maxVal = MAX(maxVal, quantized_tensor[idx]); + } + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + quantized_tensor[idx] = ExpShift(quantized_tensor[idx]-maxVal,N,SF); + } + for (int i = 0; i < dims[3]; i++) { + int idx = maxIdx + i; + if(quantized_tensor[idx] > 0 && sum > INT_MAX - quantized_tensor[idx]) + { + sum = INT_MAX; + break; + } + else { + sum += quantized_tensor[idx]; + } + } + factor[x * dims[1] * dims[2] + y * dims[2] + z] = floorf(INT_MAX/sum); + for(int i= 0; i < dims[3]; ++i) + { + int idx = maxIdx + i; + quantized_tensor[idx] = (quantized_tensor[idx] * factor[x * dims[1] * dims[2] + y * dims[2] + z]) >> (31-(2*output_bits-1)); + input[idx] =quantized_tensor[idx]*new_SF; + } + } +} + +template <> +void ShiftMaxforward<float>(const float* input, float* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { + + double new_SF = 1 / std::pow(2, 2 * output_bits - 1); // New scaling factor + + int dims_input_cuda[4] = {1, 1, 1, 1}; + for 
(std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + + // Allocate memory on the GPU + float* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor, size * sizeof(float)); + cudaMemcpy(input_cuda_tensor, input, size * sizeof(float), cudaMemcpyHostToDevice); + + int* quantized_tensor; + cudaMalloc(&quantized_tensor, size * sizeof(int)); + + int* factor; + cudaMalloc(&factor, size * sizeof(int)); + + int* dims; + cudaMalloc(&dims, 4 * sizeof(int)); + cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + // Calculate grid and block dimensions + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks( + (dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z + ); + + // Launch the kernel (assuming a templated ShiftMaxWholeKernel function exists) + ShiftMaxforward_<float><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, factor, dims, SF, N, output_bits, new_SF); + cudaDeviceSynchronize(); + + // Check for CUDA errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + } + + // Copy the result back to host + cudaMemcpy(output, input_cuda_tensor, size * sizeof(float), cudaMemcpyDeviceToHost); + + // Free allocated memory on GPU + cudaFree(quantized_tensor); + cudaFree(factor); + cudaFree(dims); + cudaFree(input_cuda_tensor); +} + +template <> +void ShiftMaxforward<double>(const double* input, double* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { + + double new_SF = 1 / std::pow(2, 2 * output_bits - 1); + + int dims_input_cuda[4] = {1, 1, 1, 1}; + for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims_input[i]); + } + + // Allocate memory on the GPU + double* input_cuda_tensor; + cudaMalloc(&input_cuda_tensor, size * sizeof(double)); + cudaMemcpy(input_cuda_tensor, input, size * sizeof(double), cudaMemcpyHostToDevice); + + int* quantized_tensor; + cudaMalloc(&quantized_tensor, size * sizeof(int)); + + int* factor; + cudaMalloc(&factor, size * sizeof(int)); + + int* dims; + cudaMalloc(&dims, 4 * sizeof(int)); + cudaMemcpy(dims, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + // Calculate grid and block dimensions + dim3 threadsPerBlock(10, 10, 10); + dim3 numBlocks( + (dims_input_cuda[0] + threadsPerBlock.x - 1) / threadsPerBlock.x, + (dims_input_cuda[1] + threadsPerBlock.y - 1) / threadsPerBlock.y, + (dims_input_cuda[2] + threadsPerBlock.z - 1) / threadsPerBlock.z + ); + + // Launch the kernel (assuming a templated ShiftMaxWholeKernel function exists) + ShiftMaxforward_<double><<<numBlocks, threadsPerBlock>>>(input_cuda_tensor, quantized_tensor, factor, dims, SF, N, output_bits, new_SF); + cudaDeviceSynchronize(); + + // Check for CUDA errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + } + + // Copy the result back to host + cudaMemcpy(output, input_cuda_tensor, size * sizeof(double), cudaMemcpyDeviceToHost); + + // Free allocated memory on GPU + cudaFree(quantized_tensor); + cudaFree(factor); + cudaFree(dims); + cudaFree(input_cuda_tensor); +} + + +template <class T> +__global__ void ShiftMaxbackward_(T* input_grad, const 
T* output_tensor, const T* output_grad, const int* dims) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < dims[0] * dims[1] * dims[2] * dims[3]) { + int w = (index / dims[3]) % dims[2]; + int h = (index / dims[3] / dims[2]) % dims[1]; + int n = index / dims[3] / dims[2] / dims[1]; + + float sum = 0.0f; + for (int i = 0; i < dims[3]; ++i) { + sum += output_tensor[n * dims[1] * dims[2] * dims[3] + h * dims[2] * dims[3] + w * dims[3] + i] * output_grad[n * dims[1] * dims[2] * dims[3] + h * dims[2] * dims[3] + w * dims[3] + i]; + } + input_grad[index] = output_tensor[index] * (output_grad[index] - sum); + } +} + +template <> +void ShiftMaxbackward<float>(const float* output_tensor, const float* output_grad, float* input_grad, size_t size, std::vector<long unsigned int> dims) +{ + int dims_input_cuda[4] = {1, 1, 1, 1}; + for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims[i]); + } + + float* output_cuda_tensor; + cudaMalloc(&output_cuda_tensor,size*sizeof(float)); + cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(float),cudaMemcpyHostToDevice); + + float* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(float)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(float),cudaMemcpyHostToDevice); + + float *input_grad_; + cudaMalloc(&input_grad_, size * sizeof(float)); + + int *dims_; + cudaMalloc(&dims_, 4 * sizeof(int)); + cudaMemcpy(dims_, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ShiftMaxbackward_<float><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,dims_); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + cudaMemcpy(input_grad, input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(output_cuda_tensor); + cudaFree(input_grad_); + cudaFree(dims_); + cudaFree(output_grad_); +} + +template <> +void ShiftMaxbackward<double>(const double* output_tensor, const double* output_grad, double* input_grad, size_t size, std::vector<long unsigned int> dims) +{ + int dims_input_cuda[4] = {1, 1, 1, 1}; + for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) { + dims_input_cuda[i] = static_cast<int>(dims[i]); + } + + double* output_cuda_tensor; + cudaMalloc(&output_cuda_tensor,size*sizeof(double)); + cudaMemcpy(output_cuda_tensor,output_tensor,size*sizeof(double),cudaMemcpyHostToDevice); + + double* output_grad_; + cudaMalloc(&output_grad_,size*sizeof(double)); + cudaMemcpy(output_grad_,output_grad,size*sizeof(double),cudaMemcpyHostToDevice); + + double *input_grad_; + cudaMalloc(&input_grad_, size * sizeof(double)); + + int *dims_; + cudaMalloc(&dims_, 4 * sizeof(int)); + cudaMemcpy(dims_, dims_input_cuda, 4 * sizeof(int), cudaMemcpyHostToDevice); + + dim3 threadParBlock(256); + dim3 Blocks((size + threadParBlock.x -1) / threadParBlock.x); + + ShiftMaxbackward_<double><<<Blocks,threadParBlock>>>(input_grad_,output_cuda_tensor,output_grad_,dims_); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + } + + cudaMemcpy(input_grad,input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree(output_cuda_tensor); + cudaFree(input_grad_); + cudaFree(dims_); + cudaFree(output_grad_); +} + + + +} \ No newline at end of file diff --git 
a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index adebd2a1bdcede94f159627f67860e7ec60a5d85..a04a1c3018b0c9ba455d21ba563253eb3e004e10 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -44,6 +44,10 @@ void Aidge::SubImpl_cuda::forward() { std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + if (dims[i].size() < 4) { + dims[i].resize(4, 1); + } + // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); int product = 1; @@ -197,7 +201,7 @@ void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector< tensorDesc, &workspaceSize)); - float *d_workspace; + void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), @@ -216,4 +220,4 @@ void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector< CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } } -} \ No newline at end of file +} diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index ab65c924e4ac9abecc132e5d7cbc4dc91e172821..807adc55e8c85f31f5e94013e174bf8cbc5a2320 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -1,9 +1,11 @@ Include(FetchContent) +set(CATCH2_VERSION v3.0.1) +message(STATUS "Retrieving Catch2 ${CATCH2_VERSION} from git") FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.0.1 # or a later release + GIT_TAG ${CATCH2_VERSION} # or a later release ) FetchContent_MakeAvailable(Catch2) diff --git a/unit_tests/Test_AndImpl.cpp b/unit_tests/Test_AndImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66de926088bb47c06ea1f9f10655730404787149 --- /dev/null +++ b/unit_tests/Test_AndImpl.cpp @@ -0,0 +1,132 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { + SECTION("Same size inputs") { + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{20, 15},{31, 11},{22, 49}}, // + {{41, 10},{24, 51},{27, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + {{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + input_1->setBackend("cuda"); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{17, 53},{27, 26},{14, 33}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{72, 44},{33, 20},{27, 55}}, // + {{35, 24},{25, 63},{28, 64}} // + }, // + { // + {{32, 65},{39, 66},{40, 70}}, // + {{41, 53},{42, 60},{34, 70}}, // + {{44, 71},{30, 12},{46, 73}} // + } // + } // + }); // + input_2->setBackend("cuda"); + const Tensor myOutput = Tensor(Array4D<float,3,3,3,2> { + { + { + {{1, 0},{0, 0},{1, 1}}, + {{0, 0},{1, 1},{0, 1}}, + {{0, 1},{1, 0},{0, 0}} + }, + { + {{1, 1},{1, 1},{1, 1}}, + {{0, 0},{1, 0},{0, 0}}, + {{1, 0},{0, 1},{0, 1}} + }, + { + {{0, 1},{1, 1},{1, 0}}, + {{1, 0},{1, 0},{0, 1}}, + {{1, 1},{0, 0},{1, 1}} + } + } + }); + + std::shared_ptr<Node> myAnd = And(); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + op->associateInput(0, input_1); + op->associateInput(1, input_2); + op->setBackend("cuda"); + op->setDataType(DataType::Float32); + myAnd->forward(); + + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } + + SECTION("Broadcasting") { + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { + { // + { // + {{10, 20},{22, 23},{20, 20}}, // + {{10, 15},{10, 29},{20, 20}}, // + {{26, 25},{33, 20},{10, 20}} // + } // + } // + }); // + input_1->setBackend("cuda"); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}}); + const Tensor myOutput = Tensor(Array4D<float,1,3,3,2> { + { // + { // + {{ 1, 1},{ 0, 0},{ 0, 1}}, // + {{ 1, 0},{ 1, 0},{ 0, 1}}, // + {{ 0, 0},{ 0, 1},{ 1, 1}} // + } // + } // + }); // + input_2->setBackend("cuda"); + std::shared_ptr<Node> myAnd = And(); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + op->associateInput(0, input_1); + op->associateInput(1, input_2); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAnd->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } +} \ No newline at end of file diff --git a/unit_tests/Test_ArgMaxImpl.cpp b/unit_tests/Test_ArgMaxImpl.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..d123b5bd3376c7169b2e003d8b366bb9045fe3e1 --- /dev/null +++ b/unit_tests/Test_ArgMaxImpl.cpp @@ -0,0 +1,155 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Add.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { + SECTION("3D Tensor") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> { + { + { + { 1.0, 2.0, 3.0, 4.0}, + { 8.0, 0.0, 17.0, 1.0}, + { 5.0, 10.0, 6.0, 0.0} + }, + { + { 7.0, 1.0, 9.0, 4.0}, + { 0.0, 8.0, 4.0, 2.0}, + { 9.0, 2.0, 0.0, 5.0} + } + } + }); + myInput->setBackend("cuda"); + SECTION("Axis 2") { + + const Tensor myOutput = Tensor(Array3D<float,2,3, 1> { + { + { + {3.0}, + {2.0}, + {1.0} + }, + { + {2.0}, + {1.0}, + {0.0} + } + } + }); + + std::shared_ptr<Node> myArgMax = ArgMax(2); + auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myArgMax->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + + } + SECTION("Axis 2 with keep_dims false") { + + const Tensor myOutput = Tensor(Array2D<float,2,3> { + { + { 3.0, 2.0, 1.0 }, + { 2.0, 1.0, 0.0 } + } + }); + + std::shared_ptr<Node> myArgMax = ArgMax(2,0); + auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myArgMax->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } + SECTION("Axis 1") { + const Tensor myOutput = Tensor(Array3D<float,2,1,4> { + { + { + { 1.0, 2.0, 1.0, 0.0 } + }, + { + { 2.0, 1.0, 0.0, 2.0 } + } + } + }); + + std::shared_ptr<Node> myArgMax = ArgMax(1); + auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myArgMax->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } + SECTION("Axis 0") { + const Tensor myOutput = Tensor(Array3D<float,1,3,4> { + { + { + { 1.0, 0.0, 1.0, 0.0 }, + { 0.0, 1.0, 0.0, 1.0 }, + { 1.0, 0.0, 0.0, 1.0 } + } + } + }); + + std::shared_ptr<Node> myArgMax = ArgMax(0); + auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myArgMax->forward(); + + 
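// Bring the CUDA result back through a CPU fallback tensor and compare it against the expected argmax indices
+ 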
std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } + } + SECTION("Select_Last_Index") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> { + { + 1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0 + } + }); + const Tensor myOutput = Tensor(Array1D<float,1> {{9}}); + + std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1); + auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myArgMax->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + REQUIRE(approxEq<float>(cudaOutput, myOutput)); + } +} \ No newline at end of file diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp index dc77e35b64fd22952e683e373fcc271c742ece75..72a4040a8ecbd091e24f8441d9c29970ea82c606 100644 --- a/unit_tests/Test_ConvImpl.cpp +++ b/unit_tests/Test_ConvImpl.cpp @@ -240,8 +240,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t kernel = kernelDist(gen); - std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2), - std::size_t(10)); + std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel), + std::size_t(10)); const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { @@ -351,8 +351,9 @@ TEST_CASE("[gpu/operator] Conv(forward)") { // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; delete[] weights; diff --git a/unit_tests/Test_DivImpl.cpp b/unit_tests/Test_DivImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07cde5d6acb8eeeff2667e5c67aedb87b893e84c --- /dev/null +++ b/unit_tests/Test_DivImpl.cpp @@ -0,0 +1,140 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Div", "[Div][GPU]") { +constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Div Operator CUDA + std::shared_ptr<Node> myDivCUDA = Div(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myDivCUDA -> getOperator()); + + // Create Div Operator CPU + std::shared_ptr<Node> myDivCPU = Div(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myDivCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + }else{ + dims1.push_back(dim); + } + dims.push_back(std::max(dims0[i], dims1[i])); + } + + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float* array0 = new float[nb_elements0]; + float* array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = 
std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } +} +} // namespace Aidge diff --git a/unit_tests/Test_ILayerNormImpl.cpp b/unit_tests/Test_ILayerNormImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0487b7c4716596e0d2e7bcbdaf812358be4de3bf --- /dev/null +++ b/unit_tests/Test_ILayerNormImpl.cpp @@ -0,0 +1,201 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 10.09.2024 + * + ********************************************************************************/ + +#include <array> + +#include <catch2/catch_test_macros.hpp> + +#include "Test_cuda.hpp" + +#include "aidge/data/Tensor.hpp" + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { + SECTION("4D Tensor") { + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} + }, + { + {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} + } + }, + { + { + {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} + }, + { + {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} + } + } + } + }); + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}); + std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, 0.3833238f ,-0.6842308f ,-0.4342245f ,-0.4717381f ,-0.1776187f, -0.2728751f, -0.4638580f, 0.2936697f, -0.9011016f}}); + + myWeight->setBackend("cuda"); + myBias->setBackend("cuda"); + + std::shared_ptr<Node> myILayerNorm = ILayerNorm(); + auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + + op -> associateInput(1, myWeight); + op -> associateInput(2, myBias); + + input0->setBackend("cuda"); + + op -> associateInput(0,input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->forward(); + + // expected output + std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + {9.8821178e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02}, + {4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00} + }, + { + {0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 4.9410585e-02, 0.0000000e+00}, + {4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02} + } + }, + { + { + {0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02}, + {9.8821178e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00} + }, + { + {4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00}, + {4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02} + } + } + } + }); + + + float* computedOutput = new float[output_ilayernorm->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_ilayernorm->size(), 
cudaMemcpyDeviceToHost); + + //test if forward result are as expected + for(int i = 0; i < output_ilayernorm->size(); i++){ + const float targetOutput = *(static_cast<float*>(output_ilayernorm->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + } + +} + +TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") + +{ + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + { + { + { + {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, + }, + }, + } + }); + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + { + { + { + {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5}, + }, + }, + } + }); + + std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + { + { + { + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, + }, + }, + } + }); + + + myWeight->setBackend("cuda"); + myBias->setBackend("cuda"); + + std::shared_ptr<Node> myILayerNorm = ILayerNorm(); + auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + + op -> associateInput(1, myWeight); + op -> associateInput(2, myBias); + + input0->setBackend("cuda"); + + op -> associateInput(0,input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myILayerNorm->forward(); + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, + }, + }, + } + }); + + + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myILayerNorm->backward()); + + std::shared_ptr<Tensor> expectedInputGradILayerNorm = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 0.467678, 0.310749, 0.1129, 0.351786, 0.0507252, 0.101587, 0.130249, -0.0646476}, + }, + }, + } + }); + + + float *computedInputGradCuda = new float[myOutputGrad->size()](); + cudaMemcpy(computedInputGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + + //test if backward result are as expected + for(int i = 0; i < expectedInputGradILayerNorm->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGradILayerNorm->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); + } + + delete[] computedInputGradCuda; +} diff --git a/unit_tests/Test_LnImpl.cpp b/unit_tests/Test_LnImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06e2205ba38ce0becd0326bf4d258b9f55a228bd --- /dev/null +++ b/unit_tests/Test_LnImpl.cpp @@ -0,0 +1,106 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Ln", "[Ln][GPU]") { +constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Ln Operator CUDA + std::shared_ptr<Node> myLnCUDA = Ln(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myLnCUDA -> getOperator()); + + // Create Ln Operator CPU + std::shared_ptr<Node> myLnCPU = Ln(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myLnCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float* array0 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + } +} +} // namespace Aidge diff 
--git a/unit_tests/Test_MulImpl.cpp b/unit_tests/Test_MulImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9eaba6e80971a7075576cd3d4d409b79dac4eb0c --- /dev/null +++ b/unit_tests/Test_MulImpl.cpp @@ -0,0 +1,140 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Mul", "[Mul][GPU]") { +constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Mul Operator CUDA + std::shared_ptr<Node> myMulCUDA = Mul(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMulCUDA -> getOperator()); + + // Create Mul Operator CPU + std::shared_ptr<Node> myMulCPU = Mul(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMulCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + }else{ + dims1.push_back(dim); + } + dims.push_back(std::max(dims0[i], dims1[i])); + } + + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float* array0 = new float[nb_elements0]; + float* array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + 
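// Allocate device memory for input #0 and upload the randomly generated host values
+ 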
cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } +} +} // namespace Aidge diff --git a/unit_tests/Test_PowImpl.cpp b/unit_tests/Test_PowImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49e65b46d7d85b7087c5c73151d643593d91e02e --- /dev/null +++ b/unit_tests/Test_PowImpl.cpp @@ -0,0 +1,355 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + // To measure execution time of 'MatPow_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + + SECTION("PowImpl::forward()") { + SECTION("Scalar / Scalar") { + + } + SECTION("Scalar / +1-D Tensor") { + + } + SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { + + // Create Pow Operator + std::shared_ptr<Node> myPowCUDA = Pow(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + std::shared_ptr<Node> myPowCPU = Pow(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // without broadcasting + float* array0 = new float[nb_elements]; + float* array1 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + array1[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + op_cpu->associateInput(0,T0_cpu); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void 
**>(&array1_d), sizeof(float) * nb_elements); + cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements); + + // input1 + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims); + op_cpu -> associateInput(1,T1_cpu); + T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements); + + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + myPowCUDA->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // REQUIRE(false); + op_cpu->forwardDims(); + myPowCPU->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } + + SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { + // Create Pow Operator + std::shared_ptr<Node> myPowCUDA = Pow(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + std::shared_ptr<Node> myPowCPU = Pow(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + // handle dimensions, replace some dimensions with '1' to get broadcasting + constexpr std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + std::vector<std::size_t> dims0 = dims; + std::vector<std::size_t> dims1 = dims; + std::vector<std::size_t> dimsOut = dims; + for (std::size_t i = 0; i < nbDims; ++i) { + if (boolDist(gen)) { + dims0[i] = 1; + } + if (boolDist(gen)) { + dims1[i] = 1; + } + dimsOut[i] = (dims0[i] == 1) ? 
dims1[i] : dims0[i]; + } + + // create arrays and fill them with random values + std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[array0_size]; + float* array1 = new float[array1_size]; + + for (std::size_t i = 0; i < array0_size; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < array1_size; ++i) { + array1[i] = valueDist(gen); + } + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); + cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); + cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); + + // input1 + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + op_cpu -> associateInput(1,T1_cpu); + T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + myPowCUDA->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + op_cpu->forwardDims(); + myPowCPU->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + + const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } + SECTION("+1-D Tensor / 1-D Tensor") { + // Create Pow Operator + std::shared_ptr<Node> myPowCUDA = Pow(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + std::shared_ptr<Node> myPowCPU = Pow(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + std::size_t number_of_operation = 0; + std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), 
std::size_t(3)); + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + // handle dimensions + constexpr std::size_t nbDims = 4; + std::vector<std::size_t> dims0(4); + for (std::size_t i = 0; i < nbDims; ++i) { + dims0[i] = dimSizeDist(gen); + } + std::vector<std::size_t> dimsOut = dims0; + std::vector<std::size_t> dims1 = dims0; + for (std::size_t i = 0; i < nbDims; ++i) { + if (boolDist(gen)) { + dims1[i] = 1; + } + } + dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); + + // create arrays and fill them with random values + std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[array0_size]; + std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array1 = new float[array1_size]; + + for (std::size_t i = 0; i < array0_size; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < array1_size; ++i) { + array1[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); + cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); + cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); + + // input1 + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + op_cpu -> associateInput(1,T1_cpu); + T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + myPowCUDA->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + op_cpu->forwardDims(); + myPowCPU->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + + const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + } + + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } + } +} +} // namespace Aidge diff --git a/unit_tests/Test_ReduceMeanImpl.cpp 
b/unit_tests/Test_ReduceMeanImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..041ad6e02d5f39fde22f34ce715d2b807e164b1a --- /dev/null +++ b/unit_tests/Test_ReduceMeanImpl.cpp @@ -0,0 +1,333 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { + SECTION("KeepDims") { + SECTION("test 1") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { + { + + {{ 12.5, 1.5 }}, + {{ 35.0, 1.5 }}, + {{ 57.5, 1.5 }} + } + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({1}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("test 2") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { + { + { + { 0.0, 0.0 }, + { 1.0, 1.0 }, + { 2.0, 2.0 } + }, + { + { 3.0, 3.0 }, + { 4.0, 4.0 }, + { 5.0, 5.0 } + }, + { + { 6.0, 6.0 }, + { 7.0, 7.0 }, + { 8.0, 8.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { + { + + {{ 1.0 }}, + {{ 4.0 }}, + {{ 7.0 }} + } + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + } + SECTION("not_KeepDims") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 
2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { + { + { 12.5, 1.5 }, + { 35.0, 1.5 }, + { 57.5, 1.5 } + } + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({1}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "computed: " << computedOutput[i] << ", target: " << targetOutput << std::endl; + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + + } + SECTION("all_axes") { + SECTION("1") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { + {18.25} + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("2") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { + {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, + { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, + { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, + { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, + { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { + {0.1293547f} + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("noop_with_empty_axes") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + 
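+                // With an empty axes list and noop_with_empty_axes set to true, ReduceMean is expected to act as an identity: the loop further below compares the CUDA output element-wise against the original input.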
myInput->setBackend("cuda"); + std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, true); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + + myInput->setBackend("cpu"); + float* computedOutput = new float[myInput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myInput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + } +} + +TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") { + SECTION("KeepDims") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + + + std::shared_ptr<Node> myReduceMean = ReduceMean({1}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceMean->forward(); + + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { + { + + {{ 1.0, 2.0 }}, + {{ 3.0, 4.0 }}, + {{ 5.0, 6.0 }} + } + }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 1.0, 2.0 }, + { 1.0, 2.0 } + }, + { + { 3.0, 4.0 }, + { 3.0, 4.0 } + }, + { + { 5.0, 6.0 }, + { 5.0, 6.0 } + } + } + }); + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myReduceMean->backward()); + + float *computedGradCuda = new float[expectedInputGrad->size()](); + cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedInputGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + delete[] computedGradCuda; + } +} +} diff --git a/unit_tests/Test_ReduceSumImpl.cpp b/unit_tests/Test_ReduceSumImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0d37754102331c8f91a1ce1c81d679761916339 --- /dev/null +++ b/unit_tests/Test_ReduceSumImpl.cpp @@ -0,0 +1,297 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { + SECTION("KeepDims") { + SECTION("test 1") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { + { + {{ 25.0, 3.0 }}, + {{ 70.0, 3.0 }}, + {{ 115.0, 3.0 }} + } + }); + + std::shared_ptr<Node> myReduceSum = ReduceSum({1}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "i: " << i << ", computed: " << computedOutput[i] << ", target: "<< targetOutput <<std::endl; + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("test 2") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { + { + { + { 0.0, 0.0 }, + { 1.0, 1.0 }, + { 2.0, 2.0 } + }, + { + { 3.0, 3.0 }, + { 4.0, 4.0 }, + { 5.0, 5.0 } + }, + { + { 6.0, 6.0 }, + { 7.0, 7.0 }, + { 8.0, 8.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { + { + + {{ 6.0 }}, + {{ 24.0 }}, + {{ 42.0 }} + } + }); + + std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + } + SECTION("not_KeepDims") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { + { + { 25.0, 3.0 }, + { 70.0, 3.0 }, + { 115.0, 3.0 } + } + }); + + std::shared_ptr<Node> myReduceSum = ReduceSum({1}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + 
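+            // keep_dims is disabled above, so reducing axis 1 of the (3,2,2) input should yield the rank-2 (3,2) output defined in myOutput rather than a (3,1,2) tensor.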
op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + + } + SECTION("all_axes") { + SECTION("1") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { + {219.0} + }); + + std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("2") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { + {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, + { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, + { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, + { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, + { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} + }); + myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { + {2.587094f} + }); + + std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, false); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + } +} + +TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") { + SECTION("KeepDims") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + myInput->setBackend("cuda"); + + + std::shared_ptr<Node> myReduceSum = ReduceSum({1}); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReduceSum->forward(); + + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { + { + + {{ 1.0, 2.0 }}, + {{ 3.0, 4.0 }}, + {{ 5.0, 6.0 }} + } + }); + 
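+        // The derivative of a sum with respect to each summed element is 1, so the backward pass should simply broadcast each output-gradient value across the reduced axis, giving the expected input gradient defined below.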
std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { + { + { + { 1.0, 2.0 }, + { 1.0, 2.0 } + }, + { + { 3.0, 4.0 }, + { 3.0, 4.0 } + }, + { + { 5.0, 6.0 }, + { 5.0, 6.0 } + } + } + }); + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myReduceSum->backward()); + + float *computedGradCuda = new float[expectedInputGrad->size()](); + cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedInputGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + delete[] computedGradCuda; + } +} +} diff --git a/unit_tests/Test_ShiftGELUImpl.cpp b/unit_tests/Test_ShiftGELUImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86e747e735eccb397caa8062f52c2561e8ef759d --- /dev/null +++ b/unit_tests/Test_ShiftGELUImpl.cpp @@ -0,0 +1,220 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#include <array> + +#include <catch2/catch_test_macros.hpp> + +#include "Test_cuda.hpp" + +#include "aidge/data/Tensor.hpp" + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { + SECTION("4D Tensor") { + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} + }, + { + {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} + } + }, + { + { + {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} + }, + { + {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} + } + } + } + }); + + //expected output of shiftgelu forward operator + std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + { 0.991388f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f }, + { 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f } + }, + { + { 0.0f, 0.413078f, 0.991388f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.991388f, 0.413078f, 0.0f }, + { 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.413078f } + } + }, + { + { + { 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f }, + { 0.991388f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f} + }, + { + { 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f }, + { 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.0f, 0.0f, 
0.413078f } + } + } + } + }); + + //expected output of GELU forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { + { + { + { + { 0.7982f, 0.3285f, 0.3809f, 0.3371f, 0.4262f, 0.7661f, 0.0000f, 0.0000f, 0.4447f, 0.4447f }, + { 0.6820f, 0.0314f, 0.0598f, 0.7028f, 0.3899f, 0.0657f, 0.6305f, 0.3285f, 0.2702f, 0.0902f } + }, + { + { 0.1428f, 0.3115f, 0.8090f, 0.1093f, 0.4824f, 0.0657f, 0.2948f, 0.8413f, 0.2384f, 0.0482f }, + { 0.2948f, 0.4729f, 0.1225f, 0.4170f, 0.0260f, 0.1428f, 0.3989f, 0.0370f, 0.3371f, 0.6203f } + } + }, + { + { + { 0.0000f, 0.0717f, 0.3899f, 0.2784f, 0.3371f, 0.1709f, 0.3632f, 0.3899f, 0.2152f, 0.6820f }, + { 0.8197f, 0.2002f, 0.0482f, 0.0260f, 0.2384f, 0.3200f, 0.4635f, 0.0717f, 0.5306f, 0.0102f } + }, + { + { 0.5209f, 0.0717f, 0.5701f, 0.4447f, 0.1497f, 0.7028f, 0.3115f, 0.2622f, 0.6407f, 0.0314f }, + { 0.7238f, 0.2002f, 0.4447f, 0.1428f, 0.5306f, 0.1359f, 0.0482f, 0.0154f, 0.0778f, 0.6305f } + } + } + } + }); + + std::shared_ptr<Node> myShiftGELU = ShiftGELU(); + auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU -> getOperator()); + op->associateInput(0,input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->forward(); + + float* computedOutput = new float[output_shiftGELU->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftGELU->size(), cudaMemcpyDeviceToHost); + + //test if forward result are as expected + for(int i = 0; i < output_shiftGELU->size(); i++){ + const float targetOutput = *(static_cast<float*>(output_shiftGELU->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + //measure difference between GELU and shiftgelu + float sum = 0.0; + for(int i = 0; i < output_GELU->size(); i++){ + const float targetOutput = *(static_cast<float*>(output_GELU->getImpl()->rawPtr()) + i); + sum += fabs(computedOutput[i] - targetOutput); + } + sum = sum / output_GELU->size(); + REQUIRE(sum < 1.5e-1); + + delete[] computedOutput; + } + +} + +TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") + +{ + + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + { + { + { + {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, + }, + }, + } + }); + + input0->setBackend("cuda"); + + std::shared_ptr<Node> myShiftGELU = ShiftGELU(); + auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myShiftGELU->forward(); + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, + }, + }, + } + }); + + + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myShiftGELU->backward()); + + //expected output of shiftgelu backward operator + std::shared_ptr<Tensor> expectedInputGradShiftGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 1.88094, 1.09182, 0.134203, 0.439603, 0.0696628, 0.173469, 0.254718, -0.084009}, + }, + }, + } + }); + + //expected output of gelu backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradGELU = 
std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 1.5159, 1.0188, 0.0971, 0.4578, 0.0931, -0.0499, 0.3620, -0.1000}, + }, + }, + } + }); + + + float *computedGradCuda = new float[myOutputGrad->size()](); + + cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + + //test if backward result are as expected + for(int i = 0; i < expectedInputGradShiftGELU->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGradShiftGELU->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); + } + + //measure difference between gelu and shifgelu + float sum = 0.0; + for(int i = 0; i < expectedInputGradGELU->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGradGELU->getImpl()->rawPtr()) + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradGELU->size(); + REQUIRE(sum < 2e-1); + + + delete[] computedGradCuda; +} diff --git a/unit_tests/Test_ShiftMaxImpl.cpp b/unit_tests/Test_ShiftMaxImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a94a23c3a04edd72cb535ebfb6e2c538e4aeee8 --- /dev/null +++ b/unit_tests/Test_ShiftMaxImpl.cpp @@ -0,0 +1,217 @@ +/******************************************************************************** + * Copyright (c) 2024 Thales + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * Author: Lucas RAKOTOARIVONY, Thales Research & Technology France + * Date: 25.06.2024 + * + ********************************************************************************/ + +#include <array> + +#include <catch2/catch_test_macros.hpp> + +#include "Test_cuda.hpp" + +#include "aidge/data/Tensor.hpp" + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { + SECTION("4D Tensor") { + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} + }, + { + {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} + } + }, + { + { + {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} + }, + { + {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} + } + } + } + }); + //expected output of shiftmax forward operator + std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { + { + { + { + { 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.055542f, 0.055542f, 0.111084f, 0.111084f }, + { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } + }, + { + { 0.0624695f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f }, + { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } + } + }, + { + { + { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 
0.0999756f, 0.0999756f }, + { 0.124969f, 0.124969f, 0.0624695f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f } + }, + { + { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }, + { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } + } + } + } + }); + //expected output of softmax forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { + { + { + { + { 0.1484f, 0.0918f, 0.0975f, 0.0928f, 0.1025f, 0.1440f, 0.0568f, 0.0568f, 0.1046f, 0.1046f }, + { 0.1436f, 0.0652f, 0.0685f, 0.1465f, 0.1064f, 0.0692f, 0.1366f, 0.0992f, 0.0925f, 0.0721f } + }, + { + { 0.0768f, 0.0957f, 0.1593f, 0.0730f, 0.1157f, 0.0681f, 0.0938f, 0.1642f, 0.0874f, 0.0661f }, + { 0.1005f, 0.1227f, 0.0798f, 0.1156f, 0.0680f, 0.0823f, 0.1133f, 0.0694f, 0.1056f, 0.1426f } + } + }, + { + { + { 0.0645f, 0.0734f, 0.1118f, 0.0981f, 0.1052f, 0.0853f, 0.1085f, 0.1118f, 0.0906f, 0.1509f }, + { 0.1743f, 0.0901f, 0.0716f, 0.0688f, 0.0947f, 0.1047f, 0.1228f, 0.0745f, 0.1317f, 0.0667f } + }, + { + { 0.1164f, 0.0665f, 0.1224f, 0.1075f, 0.0750f, 0.1394f, 0.0925f, 0.0871f, 0.1313f, 0.0620f }, + { 0.1551f, 0.0877f, 0.1172f, 0.0810f, 0.1283f, 0.0802f, 0.0697f, 0.0656f, 0.0733f, 0.1418f } + } + } + } + }); + + std::shared_ptr<Node> myShiftMax = ShiftMax(); + auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax -> getOperator()); + op->associateInput(0,input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->forward(); + + float* computedOutput = new float[output_shiftmax->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftmax->size(), cudaMemcpyDeviceToHost); + + //test if forward result are as expected + for(int i = 0; i < output_shiftmax->size(); i++){ + const float targetOutput = *(static_cast<float*>(output_shiftmax->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + //measure difference between softmax and shiftmax + float sum = 0.0; + for(int i = 0; i < output_softmax->size(); i++){ + const float targetOutput = *(static_cast<float*>(output_softmax->getImpl()->rawPtr()) + i); + sum += fabs(computedOutput[i] - targetOutput); + } + sum = sum / output_softmax->size(); + REQUIRE(sum < 4e-2); + + delete[] computedOutput; + } + +} + +TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") + +{ + + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + { + { + { + {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, + }, + }, + } + }); + + input0->setBackend("cuda"); + + std::shared_ptr<Node> myShiftMax = ShiftMax(); + auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator()); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myShiftMax->forward(); + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, + }, + }, + } + }); + + + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myShiftMax->backward()); + + 
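+    // The reference gradients below are checked element-wise for the shift-based approximation; the mean deviation from the PyTorch softmax gradient is then required to stay within a loose tolerance.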
//expected output of shiftmax backward operator + std::shared_ptr<Tensor> expectedInputGradShiftMax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 0.159378, 0.0249331, -0.0250217, 0.0262418, -0.0514701, -0.00459638, -0.0551896, -0.0739511}, + }, + }, + } + }); + + //expected output of softmax backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradSoftmax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + { + { + { + { 0.1672, 0.0198, -0.0236, 0.0241, -0.0535, -0.0042, -0.0547, -0.0752}, + }, + }, + } + }); + + + float *computedGradCuda = new float[myOutputGrad->size()](); + + cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + + //test if backward result are as expected + for(int i = 0; i < expectedInputGradShiftMax->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGradShiftMax->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + //measure difference between softmax and shiftmax + float sum = 0.0; + for(int i = 0; i < expectedInputGradSoftmax->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGradSoftmax->getImpl()->rawPtr()) + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradSoftmax->size(); + REQUIRE(sum < 4e-3); + + delete[] computedGradCuda; +} diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp index cad4a1a067d55fe5c8246d09a733f44007886dc0..cb120a970c5310f80f8c62960c029a845937ba30 100644 --- a/unit_tests/Test_TensorImpl.cpp +++ b/unit_tests/Test_TensorImpl.cpp @@ -122,3 +122,42 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(val[7] == 8); } } + +TEST_CASE("Tensor Descriptor Update") { + Tensor x; + x.setBackend("cuda"); + + std::vector<std::size_t> shapeA = { 7, 6, 5, 4, 3 }; + x.resize(shapeA); + + cudnnTensorDescriptor_t desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); + + cudnnDataType_t currentDataType; + int currentNbDims; + std::vector<int> currentDimA(shapeA.size()); + std::vector<int> currentStrideA(shapeA.size()); + + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeA.size(), ¤tDataType, ¤tNbDims, currentDimA.data(), currentStrideA.data())); + + REQUIRE(std::equal(currentDimA.begin(), currentDimA.end(), shapeA.begin(), [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + } + ) + ); + + // Change the tensor shape and check tensor descriptor + std::vector<std::size_t> shapeB = { 6, 5, 4 }; + x.resize(shapeB); + + std::vector<int> currentDimB(shapeB.size()); + std::vector<int> currentStrideB(shapeB.size()); + + desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeB.size(), ¤tDataType, ¤tNbDims, currentDimB.data(), currentStrideB.data())); + + REQUIRE(std::equal(currentDimB.begin(), currentDimB.end(), shapeB.begin(), [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + } + ) + ); +} diff --git a/version.txt b/version.txt index f4778493c50025c6ab147a1fec7486ef0c706792..69367fd08f3ce302151ebc9779193d517dfa32de 100644 --- a/version.txt +++ b/version.txt @@ -1 +1,2 @@ -0.2.2 \ No newline at end of file +0.3.0 +