Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • eclipse/aidge/aidge_backend_cuda
  • hrouis/aidge_backend_cuda
  • oantoni/aidge_backend_cuda
  • jeromeh/aidge_backend_cuda
  • lrakotoarivony/aidge_backend_cuda
  • silvanosky/aidge_backend_cuda
  • maab05/aidge_backend_cuda
  • noamzerah/aidge_backend_cuda
  • farnez/aidge_backend_cuda
  • axelfarr/aidge_backend_cuda
  • farges/trying_all-major
  • bobot/aidge_backend_cuda
  • theodorget/aidge_backend_cuda
13 results
Show changes
Commits on Source (206)
Showing
with 559 additions and 87 deletions
......@@ -4,6 +4,7 @@
# C++ Build
build*/
install*/
include/aidge/backend/cuda_version.h
# VSCode
.vscode
......
......@@ -12,23 +12,46 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
# - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
# - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
# - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
build:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
test:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
coverage:ubuntu_python:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
.build:ubuntu_cpp:template:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
test:ubuntu_cpp:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
coverage:ubuntu_cpp:
# Use cudnn image instead of the cuda one
image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
release:pip:ubuntu:
tags:
tags:
- release:cuda
variables:
DOCKER_HOST: unix:///var/run/docker.sock
......@@ -36,19 +59,22 @@ release:pip:ubuntu:
BUILD_WITH_CUDA=1
AIDGE_DEPENDENCIES='aidge_core aidge_backend_cpu'
AIDGE_INSTALL='/AIDGE_INSTALL_CIBUILDWHEEL'
CUDA_TOOLKIT_VERSION='11-8'
DOCKER_HOST='unix:///var/run/docker.sock'
ARCH='x86_64'
CUDNN_VERSION='9'
CUDA_MAJOR_VERSION='11'
CUDA_MAJOR_VERSION='12'
CUDA_MINOR_VERSION='8'
SEARCH_PATH='/home/ubuntu/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
SEARCH_PATH='/home/gitlab-runner/builds/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID'
CIBW_REPAIR_WHEEL_COMMAND='auditwheel --verbose repair {wheel} -w {dest_dir} --exclude libcudart.so.12 --exclude libcudnn.so.9 --exclude libcublas.so.12 --exclude libcublasLt.so.12'
parallel:
matrix:
- CIBW_BUILD: "cp38-manylinux_x86_64"
- CIBW_BUILD: "cp39-manylinux_x86_64"
# - CIBW_BUILD: "cp38-manylinux_x86_64"
# - CIBW_BUILD: "cp39-manylinux_x86_64"
- CIBW_BUILD: "cp310-manylinux_x86_64"
- CIBW_BUILD: "cp311-manylinux_x86_64"
- CIBW_BUILD: "cp312-manylinux_x86_64"
- CIBW_BUILD: "cp313-manylinux_x86_64"
before_script:
# retrieve aidge dependencies
......@@ -56,9 +82,11 @@ release:pip:ubuntu:
- !reference [.ubuntu:download:repositories, before_script] # located in common.gitlab-ci.yml
script:
- /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
# - /home/ubuntu/.local/bin/cibuildwheel --output-dir wheelhouse
- cibuildwheel --output-dir wheelhouse
after_script:
# Ensure all files are owned by the correct user at the end of the job
- sudo chown -R $(whoami):$(whoami) .
# after_script:
# # Ensure all files are owned by the correct user at the end of the job
# # Note: sudo requires the job to have privileged = true
# - sudo chown -R $(whoami):$(whoami) .
#!/bin/bash
set -e
if [[ "$1" == "" ]]; then
if [[ "$1" == "" ]]; then
echo "build aidge deps in cibuildwheel container before building wheel."
echo "search path defines where the dependencies will be searched."
echo "Hint : In wheel containers, files are mounted on /host by default."
......@@ -10,13 +10,14 @@ set -x
if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_ core
mkdir -p build # creating build if its not already there to hold the build of cpp files
rm -rf build/* # build from scratch
else
else
for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
search_path=$1
REPO_PATH=$(find $search_path ! -writable -prune -o -type d \
-name "$repo" \
-not -path "*/install/*" \
-not -path "*/.git/*" \
-not -path "*/.mypy_cache/*" \
-not -path "*/miniconda/*" \
-not -path "*/conda/*" \
-not -path "*/.local/*" \
......@@ -24,7 +25,7 @@ else
-not -path "*/$repo/$repo/*" \
-not -path "*/proc/*" \
-print -quit)
if [[ -z "$REPO_PATH" ]]; then
if [[ -z "$REPO_PATH" ]]; then
echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
exit -1
fi
......@@ -33,6 +34,10 @@ else
mkdir -p build # creating build if its not already there to hold the build of cpp files
rm -rf build/* # build from scratch
pip install . -v
# Give all rights on generated build folder to avoid root issues once out of the Docker
chmod -R a+rwX build/
chmod -R a+rwX *.egg-info/
cd -
done
fi
......
# Version 0.5.0 (January 31, 2024)
# Version 0.4.0 (December 6, 2024)
# Version 0.1.0 (January 23, 2024)
Initial release
# CMake >= 3.18 is required for good support of FindCUDAToolkit
cmake_minimum_required(VERSION 3.18)
set(CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
# Parse version.txt to retrieve Major, Minor and Path
string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" _ MATCHES ${version})
set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
project(aidge_backend_cuda
VERSION ${version}
DESCRIPTION "CUDA implementations of the operators of aidge framework."
......@@ -21,18 +30,17 @@ execute_process(
)
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
# Define a preprocessor macro with the Git commit version
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
##############################################
# Define options
option(PYBIND "python binding" OFF)
option(WERROR "Warning as error" OFF)
option(TEST "Enable tests" ON)
option(COVERAGE "Enable coverage" OFF)
option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF)
option(PYBIND "python binding (Default: OFF)" OFF)
option(WERROR "Warning as error (Default: OFF)" OFF)
option(TEST "Enable tests (Default: ON)" ON)
option(COVERAGE "Enable coverage (Default: OFF)" OFF)
option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...) (Default: OFF)" OFF)
##############################################
# Import utils CMakeLists
......@@ -60,14 +68,14 @@ endif()
##########
# CUDA
if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
if(NOT $ENV{CIBUILDWHEEL} STREQUAL "")
message(WARNING "Env var CIBUILDWHEEL detected : currently building for a release job."
"\nSetting manually CUDACXX, PATH & LD_LIBRARY_PATH Variables")
list(APPEND ENV{LD_LIBRARY_PATH} /usr/local/cuda/lib64)
list(APPEND ENV{PATH} /usr/local/cuda/bin)
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
endif()
find_package(CUDAToolkit REQUIRED)
find_package(CUDAToolkit 12 REQUIRED)
if(NOT DEFINED CMAKE_CUDA_STANDARD)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
......@@ -97,7 +105,7 @@ if (PYBIND)
# Handles Python + pybind11 headers dependencies
include(PybindModuleCreation)
# creates a target of the same name as CMAKE_PROJECT_NAME
generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.
generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) # the python bindings module has the same name as the project.
target_link_libraries(${module_name}
PUBLIC
......@@ -107,12 +115,21 @@ if (PYBIND)
)
endif()
message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h")
# Generate version.h file from config file version.h.in
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h"
)
target_link_libraries(${module_name}
PUBLIC
_aidge_core # _ is added because we link the target not the project
CUDA::cudart
PRIVATE
CUDA::cublas
cudnn
CUDA::cudart
)
if( ${ENABLE_ASAN} )
......
......@@ -8,6 +8,8 @@ You can find in this folder the library that implements the CUDA operators.
## Installation
### Dependencies
- [CUDA ToolKit 12.4](https://developer.nvidia.com/cuda-12-4-0-download-archive)
- [CUDnn9](https://developer.nvidia.com/cudnn-downloads) make sure to install the CUDA 12 compatible version
- `GCC`
- `Make`/`Ninja`
- `CMake`
......
from aidge_backend_cuda.aidge_backend_cuda import * # import so generated by PyBind
from ._version import *
from . import benchmark
import time
import numpy as np
import aidge_core
import aidge_backend_cuda
def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# update model and inputs backend
model.set_backend("cuda")
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cuda")
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
timings = []
# Warm-up runs.
for i in range(nb_warmup + nb_iterations):
if i < nb_warmup:
scheduler.forward(forward_dims=False, data=ordered_inputs)
else:
start = time.process_time()
scheduler.forward(forward_dims=False, data=ordered_inputs)
end = time.process_time()
timings.append((end - start))
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
# update model and inputs backend
model.set_backend("cuda", device = 1)
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cuda", device = 1)
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
scheduler.forward(forward_dims=False, data=ordered_inputs)
outs = []
for pair in model.get_ordered_outputs():
t = pair[0].get_operator().get_output(pair[1])
t.set_backend("cpu")
outs.append(t)
return [np.array(out) for out in outs]
\ No newline at end of file
function(generate_python_binding pybind_module_name target_to_bind)
function(generate_python_binding pybind_module_name target_to_bind)
add_definitions(-DPYBIND)
Include(FetchContent)
set(PYBIND_VERSION v2.10.4)
set(PYBIND_VERSION v2.13.6)
set(PYBIND11_FINDPYTHON ON)
message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
......@@ -20,6 +20,18 @@ function(generate_python_binding pybind_module_name target_to_bind)
file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
target_include_directories(${pybind_module_name} PUBLIC "python_binding")
target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind})
target_include_directories(${pybind_module_name} PRIVATE "python_binding")
target_link_libraries(${pybind_module_name}
PRIVATE
${target_to_bind}
CUDA::cublas
cudnn
CUDA::cudart
)
set_property(TARGET ${pybind_module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(${pybind_module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${pybind_module_name} PROPERTIES
CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,ALL"
)
set_target_properties(${module} PROPERTIES INSTALL_RPATH "")
endfunction()
......@@ -11,35 +11,44 @@
#ifndef AIDGE_BACKEND_CUDA_IMPORTS_H_
#define AIDGE_BACKEND_CUDA_IMPORTS_H_
#include "aidge/backend/cuda_version.h"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/OperatorImpl.hpp"
#include "aidge/backend/cuda/operator/AbsImpl.hpp"
#include "aidge/backend/cuda/operator/AddImpl.hpp"
#include "aidge/backend/cuda/operator/AndImpl.hpp"
#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
#include "aidge/backend/cuda/operator/BitShiftImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/operator/ClipImpl.hpp"
#include "aidge/backend/cuda/operator/DivImpl.hpp"
#include "aidge/backend/cuda/operator/EqualImpl.hpp"
#include "aidge/backend/cuda/operator/ErfImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
#include "aidge/backend/cuda/operator/LRNImpl.hpp"
#include "aidge/backend/cuda/operator/LnImpl.hpp"
#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/MatMulImpl.hpp"
#include "aidge/backend/cuda/operator/MulImpl.hpp"
#include "aidge/backend/cuda/operator/PadImpl.hpp"
#include "aidge/backend/cuda/operator/PowImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/ResizeImpl.hpp"
#include "aidge/backend/cuda/operator/RoundImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
#include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cuda/operator/SqrtImpl.hpp"
#include "aidge/backend/cuda/operator/SubImpl.hpp"
#include "aidge/backend/cuda/operator/TanhImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp"
#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
/********************************************************************************
* Copyright (c) 2025 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_DATA_INTERPOLATION_H_
#define AIDGE_CUDA_DATA_INTERPOLATION_H_
#include "aidge/data/Interpolation.hpp"
namespace Aidge {
namespace InterpolationCUDA {
/**
* @brief Computes the approximate input coordinates corresponding to given output coordinates,
* according to the specified coordinate transformation mode.
*
* This function performs an "untransform" step for spatial interpolation, mapping output-space
* integer coordinates (`coordOut`) back into approximate input-space floating-point coordinates
* (`coordInApprox`) based on the transformation strategy provided.
*
* @param coordOut Pointer to the output coordinates (integer values).
* @param inputDims Pointer to the dimensions of the input tensor.
* @param outputDims Pointer to the dimensions of the output tensor.
* @param coordTransfoMode The coordinate transformation mode (e.g., AlignCorners, HalfPixel, etc.).
* @param roi Pointer to region of interest values, used only for TFCropAndResize mode;
* expected to be of size 2 * rank.
* @param coordInApprox Pointer to the output array where the approximate input coordinates will be stored.
* @param rank The dimensionality of the spatial domain (e.g., 2 for 2D, 3 for 3D).
*/
__device__ void untransformCoordinates(
const int* coordOut,
const int* inputDims,
const int* outputDims,
Aidge::Interpolation::CoordinateTransformation coordTransfoMode,
const float* roi,
float* coordInApprox,
int rank);
/**
* @brief Retrieves neighboring input tensor values around a set of continuous coordinates for interpolation.
*
* This function gathers neighboring values from a multidimensional tensor for use in interpolation,
* depending on the interpolation mode (e.g., linear or cubic), scaling, padding strategy, and
* anti-aliasing settings. The gathered values and their coordinates are stored in output buffers.
*
* @tparam T The data type of the input and output tensor values.
* @param tensorValues Pointer to the input tensor values (flattened array).
* @param tensorDims Pointer to the dimensions of the input tensor.
* @param coords Pointer to the floating-point coordinates in the input space.
* @param scales Pointer to scaling factors per dimension.
* @param rank The number of spatial dimensions in the tensor.
* @param mode Interpolation mode (e.g., Linear, Cubic).
* @param paddingMode Padding mode used for out-of-bound accesses (e.g., Zero, Edge).
* @param antialiasing Whether antialiasing is enabled (affects kernel footprint).
* @param outValues Pointer to the buffer where retrieved neighbor values will be stored.
* @param outCoords Pointer to the buffer where the corresponding coordinates of neighbors will be stored.
* Output shape is [maxNeighbours x rank].
* @param outCount Pointer to a single integer where the number of valid neighbors will be written.
* @param maxNeighbours Maximum number of neighbors to retrieve (capacity of the output buffers).
*/
template <typename T>
__device__ void retrieveNeighboursKernel(
const T* tensorValues,
const int* tensorDims,
float* coords,
const float* scales,
int rank,
Aidge::Interpolation::Mode mode,
Aidge::PadBorderType paddingMode,
bool antialiasing,
T* outValues,
int* outCoords,
int* outCount,
int maxNeighbours
);
/**
* @brief Performs N-dimensional linear interpolation given neighbor points and their values.
*
* This function computes a weighted average of neighbor values based on linear interpolation
* weights derived from their distance to a target coordinate. It optionally applies antialiasing
* scaling to the distances. The weights are normalized to ensure smooth interpolation.
*
* @tparam T The data type of the input and output values.
* @param coordToInterpolate Pointer to the floating-point coordinate to interpolate at (length = rank).
* @param scales Pointer to scale factors per dimension.
* @param pointsCoords Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
* @param pointValues Pointer to values corresponding to neighbor coordinates.
* @param coordsNbr Number of neighboring points used in the interpolation.
* @param rank Number of spatial dimensions.
* @param antialiasing Whether to apply antialiasing by scaling coordinate deltas.
* @return T The interpolated value at the target coordinate.
*/
template <typename T>
__device__ T interpolateLinear(
const float* coordToInterpolate,
const float* scales,
const int* pointsCoords,
const T* pointValues,
int coordsNbr,
int rank,
bool antialiasing
);
/**
* @brief Performs N-dimensional cubic interpolation given neighbor points and their values.
*
* This function computes a weighted average of neighbor values based on cubic interpolation weights,
* optionally applying antialiasing and excluding neighbors outside input dimensions.
* Static dimensions (with size 1 and scale 1) are ignored in the weight calculation.
*
* @tparam T The data type of the input and output values.
* @param coordToInterpolate Pointer to the floating-point coordinate to interpolate at (length = rank).
* @param scales Pointer to scale factors per dimension.
* @param pointsCoords Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
* @param pointValues Pointer to values corresponding to neighbor coordinates.
* @param coordsNbr Number of neighboring points used in the interpolation.
* @param rank Number of spatial dimensions.
* @param inputDims Pointer to input tensor dimensions.
* @param a Cubic interpolation parameter (often -0.75 for Catmull-Rom).
* @param antialiasing Whether to apply antialiasing in weight calculation.
* @param excludeOutside Whether to exclude neighbors outside input dimensions.
* @return T The interpolated value at the target coordinate.
*/
template <typename T>
__device__ T interpolateCubic(
float* coordToInterpolate,
const float* scales,
int* pointsCoords,
T* pointValues,
int coordsNbr,
int rank,
const int* inputDims,
float a,
bool antialiasing,
bool excludeOutside
);
/**
* @brief Dispatches to the appropriate interpolation method (linear or cubic) based on the mode.
*
* This function selects the interpolation method and computes the interpolated value
* at the given coordinate using either cubic or linear interpolation. Returns zero
* for unsupported interpolation modes.
*
* @tparam T The data type of the input and output values.
* @param coordToInterpolate Pointer to the floating-point coordinate to interpolate at (length = rank).
* @param scales Pointer to scale factors per dimension.
* @param pointsCoords Pointer to neighbor coordinates (flattened array of size coordsNbr × rank).
* @param pointValues Pointer to values corresponding to neighbor coordinates.
* @param coordsNbr Number of neighboring points used in the interpolation.
* @param rank Number of spatial dimensions.
* @param mode Interpolation mode (Linear or Cubic).
* @param cubicCoeffA Cubic interpolation coefficient (used only for cubic mode).
* @param antialiasing Whether to apply antialiasing in interpolation.
* @param excludeOutside Whether to exclude neighbors outside input dimensions (only cubic mode).
* @param inputDims Pointer to input tensor dimensions.
* @return T The interpolated value at the target coordinate.
*/
template <typename T>
__device__ T interpolate(float* coordToInterpolate,
const float* scales,
int* pointsCoords,
T* pointValues,
int coordsNbr,
int rank,
Aidge::Interpolation::Mode mode,
float cubicCoeffA,
bool antialiasing,
bool excludeOutside,
const int* inputDims);
} // namespace InterpolationCUDA
} // namespace Aidge
#endif /*AIDGE_CUDA_DATA_INTERPOLATION_H_*/
\ No newline at end of file
......@@ -4,6 +4,10 @@
#include <cstddef> // std::size_t
#include <memory>
#include <string>
#include <vector>
#include <cuda.h>
#include <type_traits> // std::enable_if, std::is_same
#include "aidge/backend/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"
......@@ -17,14 +21,9 @@
namespace Aidge {
template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
template <>
void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);
void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);
/**
* @brief Abstract class for the TensorImpl_cuda class template.
......@@ -94,7 +93,7 @@ public:
}
void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
const T* srcT = static_cast<const T *>(src);
T* dstT = static_cast<T *>(rawPtr(offset));
......@@ -107,62 +106,67 @@ public:
return;
}
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
switch (srcDt) {
case DataType::Float64:
thrust_copy(static_cast<const double*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Float32:
thrust_copy(static_cast<const float*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Float16:
thrust_copy(static_cast<const half_float::half*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Int64:
thrust_copy(static_cast<const int64_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::UInt64:
thrust_copy(static_cast<const uint64_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Int32:
thrust_copy(static_cast<const int32_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::UInt32:
thrust_copy(static_cast<const uint32_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Int16:
thrust_copy(static_cast<const int16_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::UInt16:
thrust_copy(static_cast<const uint16_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Int8:
thrust_copy(static_cast<const int8_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::UInt8:
thrust_copy(static_cast<const uint8_t*>(src),
static_cast<T*>(rawPtr(offset)),
length);
static_cast<size_t>(length));
break;
case DataType::Boolean:
thrust_copy(static_cast<const bool*>(src),
static_cast<T*>(rawPtr(offset)),
static_cast<size_t>(length));
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt);
......@@ -171,21 +175,22 @@ public:
}
void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
}
void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts);
CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
}
void *rawPtr(NbElts_t offset = 0) override {
cudaSetDevice(mDevice);
lazyInit();
return (mData.data() + offset);
};
......@@ -279,16 +284,18 @@ private:
template <typename T>
const std::string TensorImpl_cuda<T>::Backend = "cuda";
namespace {
static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
{"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
{"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
{"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
{"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
} // namespace
REGISTRAR(Tensor, {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
REGISTRAR(Tensor, {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
REGISTRAR(Tensor, {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int64}, Aidge::TensorImpl_cuda<int64_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int16}, Aidge::TensorImpl_cuda<int16_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Int8}, Aidge::TensorImpl_cuda<int8_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt64}, Aidge::TensorImpl_cuda<uint64_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt32}, Aidge::TensorImpl_cuda<uint32_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt16}, Aidge::TensorImpl_cuda<uint16_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::UInt8}, Aidge::TensorImpl_cuda<uint8_t>::create);
REGISTRAR(Tensor, {"cuda", DataType::Boolean}, Aidge::TensorImpl_cuda<bool>::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Abs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class AbsImpl_cuda : public OperatorImpl {
public:
AbsImpl_cuda(const Abs_Op& op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<AbsImpl_cuda> create(const Abs_Op& op) {
return std::make_unique<AbsImpl_cuda>(op);
}
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
{DataType::Float16},
};
}
void forward() override;
private:
std::shared_ptr<Tensor> mInputFallback;
std::shared_ptr<Tensor> mOutputGradFallback;
template <class T> void forward_(const Tensor& input);
};
// Implementation entry point registration to Operator
REGISTRAR(Abs_Op, "cuda", Aidge::AbsImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class T>
void absForward(const T* input, T* output, int size);
}
#endif /* AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ */
......@@ -36,7 +36,7 @@ public:
return std::make_unique<AddImpl_cuda>(op);
}
virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
......@@ -46,10 +46,18 @@ public:
void forward() override;
void backward() override;
~AddImpl_cuda();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
std::vector<cudnnTensorDescriptor_t> mTensorDesc;
cudnnReduceTensorDescriptor_t mBwdReduceDesc = nullptr;
size_t mBwdWorkspaceSize = 0;
void* mBwdWorkspace = nullptr;
std::vector<std::shared_ptr<Tensor>> mInputFallbacks;
std::shared_ptr<Tensor> mOutputGradFallback;
template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs);
template <class T> void backward_(const Tensor& outGrad);
};
// Implementation entry point registration to Operator
......
......@@ -36,11 +36,11 @@ public:
return std::make_unique<ArgMaxImpl_cuda>(op);
}
virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
{DataType::Float16},
{DataType::Float64, DataType::Int64},
{DataType::Float32, DataType::Int64},
{DataType::Float16, DataType::Int64},
};
}
......@@ -50,7 +50,7 @@ private:
// CuDNN specific variables
std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
template <class I, class O> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
};
// Implementation entry point registration to Operator
......
......@@ -23,8 +23,8 @@
namespace Aidge
{
template <class T>
void ArgMax_cuda_forward_kernel(const T* input, T* output,
template <class I, class O>
void ArgMax_cuda_forward_kernel(const I* input, O* output,
const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
int axis, int total_elems, std::size_t selectLastIdx);
}
......
......@@ -37,7 +37,7 @@ public:
return std::make_unique<AvgPoolingImpl_cuda>(op);
}
virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
......
......@@ -37,7 +37,7 @@ public:
return std::make_unique<BatchNormImpl_cuda>(op);
}
virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Float64},
{DataType::Float32},
......
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/BitShift.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class BitShiftImpl_cuda : public OperatorImpl {
public:
BitShiftImpl_cuda(const BitShift_Op& op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<BitShiftImpl_cuda> create(const BitShift_Op& op) {
return std::make_unique<BitShiftImpl_cuda>(op);
}
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
return {
{DataType::Int32},
{DataType::Int64}
};
}
void forward() override;
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
};
// Implementation entry point registration to Operator
REGISTRAR(BitShift_Op, "cuda", Aidge::BitShiftImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_BITSHIFTIMPL_H_ */