Commit dc147250 authored by Maxence Naud

Merge branch 'dev' into 'main'

version 0.2.0

See merge request !15
parents 62fe9d5f 1af4d7d4
Pipeline #43371 passed
Showing 1175 additions and 210 deletions
+include:
+  - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
+
 build:ubuntu_cpp:
   stage: build
   needs: []
@@ -6,15 +9,14 @@ build:ubuntu_cpp:
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
     - mkdir -p build_cpp
@@ -35,15 +37,14 @@ build:ubuntu_cpp_g++10:
     - docker
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -63,14 +64,14 @@ build:ubuntu_cpp_g++12:
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -90,14 +91,14 @@ build:ubuntu_cpp_clang12:
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -117,14 +118,13 @@ build:ubuntu_cpp_clang15:
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -144,12 +144,13 @@ build:ubuntu_python:
   script:
     # Download dependencies
-    # aidge_core (Python)
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_python"'
-    - unzip -o build_artifacts.zip -d .
-    # aidge_backend_cpu (Python)
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_python"'
-    - unzip -o build_artifacts.zip -d .
+    - DEPENDENCY_JOB="build:ubuntu_python"
+    # aidge_core (python)
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
+    # aidge_backend_cpu (python)
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]

     - python3 -m pip install virtualenv
     - virtualenv venv
......
@@ -7,11 +7,12 @@ file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project)
 message(STATUS "Project name: ${project}")
 message(STATUS "Project version: ${version}")

 # Note : project name is {project} and python module name is also {project}
 set(module_name _${project}) # target name

 project(${project})
+set(CXX_STANDARD 14)

 ##############################################
 # Define options
@@ -19,6 +20,7 @@ option(PYBIND "python binding" ON)
 option(WERROR "Warning as error" OFF)
 option(TEST "Enable tests" ON)
 option(COVERAGE "Enable coverage" OFF)
+option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF)

 ##############################################
 # Import utils CMakeLists
@@ -36,8 +38,9 @@ enable_language(CUDA)
 find_package(CUDAToolkit REQUIRED)
 find_package(aidge_core REQUIRED)
-find_package(aidge_backend_cpu REQUIRED)
+if(TEST)
+    find_package(aidge_backend_cpu REQUIRED)
+endif()

 ##############################################
 # Create target and set properties
@@ -48,11 +51,31 @@ add_library(${module_name} ${src_files} ${inc_files})
 target_link_libraries(${module_name}
     PUBLIC
         _aidge_core # _ is added because we link the target not the project
-        _aidge_backend_cpu # _ is added because we link the target not the project
         CUDA::cudart
+        CUDA::cublas
         cudnn
 )
+
+if( ${ENABLE_ASAN} )
+    message("Building ${module_name} with ASAN.")
+    set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer)
+    target_link_libraries(${module_name}
+        PUBLIC
+            -fsanitize=address
+    )
+    target_compile_options(${module_name}
+        PRIVATE
+            ${SANITIZE_FLAGS}
+    )
+endif()
+
+if(TEST)
+    target_link_libraries(${module_name}
+        PUBLIC
+            _aidge_backend_cpu # _ is added because we link the target not the project
+    )
+endif()

 #Set target properties
 target_include_directories(${module_name}
     PUBLIC
@@ -76,7 +99,7 @@ if (PYBIND)
     # Handles Python + pybind11 headers dependencies
     target_link_libraries(${module_name}
         PUBLIC
             pybind11::pybind11
         PRIVATE
             Python::Python
@@ -119,8 +142,8 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 install(EXPORT ${project}-targets
     FILE "${project}-targets.cmake"
     DESTINATION ${INSTALL_CONFIGDIR}
     # COMPONENT ${module_name}
 )

 #Create a ConfigVersion.cmake file
 include(CMakePackageConfigHelpers)
......
@@ -13,7 +13,10 @@
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_

 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
+#include "aidge/backend/cuda/operator/FCImpl.hpp"
+#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/ReLUImpl.hpp"

 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
 #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_

+#include <cstddef>  // std::size_t
+#include <memory>
+#include <string>
+
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Registrar.hpp"
@@ -24,19 +28,27 @@ void thrust_copy(const half_float::half* srcData, half_float::half* dstData, siz
 /**
  * @brief Abstract class for the TensorImpl_cuda class template.
  * @details Its purpose is to provide access to base methods that are specific
  * to the implementation (which are therefore not present in the TensorImpl
  * class), but whose data type does not need to be known.
  */
 class TensorImpl_cuda_ {
+protected:
+    mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;
+
 public:
     /**
      * @brief Return the CuDNN tensor descriptor of the tensor.
      * @details This method uses lazy initialization for the descriptor
      * (which is therefore mutable in the derived class).
      * @return cudnnTensorDescriptor_t CuDNN tensor descriptor.
      */
-    virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc() const = 0;
+    virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0;
+
+    virtual ~TensorImpl_cuda_() {
+        if (mCudnnTensor != nullptr)
+            cudnnDestroyTensorDescriptor(mCudnnTensor);
+    }
 };

 template <class T>
@@ -54,119 +66,117 @@ private:
     }

 private:
-    const Tensor &mTensor; // Impl needs to access Tensor information, but is not
-                           // supposed to change it!
+    /// Pointer to the data and its capacity
     future_std::span<T> mData;
     /// If this instance own the data, std::unique_ptr manages it
     std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner;
-    mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;

 public:
-    static constexpr const char *Backend = "cuda";
+    static const std::string Backend;

-    TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor), mDataOwner(nullptr, cudaDelete) {}
+    TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {}

     bool operator==(const TensorImpl &otherImpl) const override final;

-    static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
-        return std::make_unique<TensorImpl_cuda<T>>(tensor);
+    static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) {
+        return std::make_shared<TensorImpl_cuda<T>>(device, dims);
     }

     // native interface
     const future_std::span<T>& data() const { return mData; }

-    std::size_t size() const override { return mData.size(); }
-    std::size_t scalarSize() const override { return sizeof(T); }
-
-    void setDevice(DeviceIdx_t device) override {
-        mDevice = device;
-    }
+    std::size_t scalarSize() const noexcept override { return sizeof(T); }

     void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        void* dst = static_cast<void*>(static_cast<T*>(rawPtr()) + offset);
-        CHECK_CUDA_STATUS(cudaMemcpy(dst, src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        const T* srcT = static_cast<const T *>(src);
+        T* dstT = static_cast<T *>(rawPtr(offset));
+        AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "overlapping copy is not supported");
+        CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }

-    void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
+    void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override {
         if (length == 0) {
             return;
         }

-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        if (srcDt == DataType::Float64) {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        switch (srcDt) {
+        case DataType::Float64:
             thrust_copy(static_cast<const double*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Float32) {
+            break;
+        case DataType::Float32:
             thrust_copy(static_cast<const float*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Float16) {
+            break;
+        case DataType::Float16:
             thrust_copy(static_cast<const half_float::half*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int64) {
+            break;
+        case DataType::Int64:
             thrust_copy(static_cast<const int64_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt64) {
+            break;
+        case DataType::UInt64:
             thrust_copy(static_cast<const uint64_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int32) {
+            break;
+        case DataType::Int32:
             thrust_copy(static_cast<const int32_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt32) {
+            break;
+        case DataType::UInt32:
             thrust_copy(static_cast<const uint32_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int16) {
+            break;
+        case DataType::Int16:
             thrust_copy(static_cast<const int16_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt16) {
+            break;
+        case DataType::UInt16:
             thrust_copy(static_cast<const uint16_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int8) {
+            break;
+        case DataType::Int8:
             thrust_copy(static_cast<const int8_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt8) {
+            break;
+        case DataType::UInt8:
             thrust_copy(static_cast<const uint8_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else {
+            break;
+        default:
             AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
+            break;
         }
     }

-    void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+    void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }

-    void copyFromHost(const void *src, NbElts_t length) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
+    void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
     }

-    void copyToHost(void *dst, NbElts_t length) const override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost));
+    void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
     }

     void *rawPtr(NbElts_t offset = 0) override {
@@ -175,30 +185,27 @@ public:
     };

     const void *rawPtr(NbElts_t offset = 0) const override {
-        AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
+        AIDGE_ASSERT(mData.size() >= mNbElts, "accessing uninitialized const rawPtr");
         return (mData.data() + offset);
     };

-    const cudnnTensorDescriptor_t& getCudnnTensorDesc() const override {
+    const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override {
         if (mCudnnTensor == nullptr) {
             CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor));

-            if (mTensor.size() > 0) {
+            if (tensor.size() > 0) {
                 /**
                 ** cudNN Tensors are restricted to having at least 4 dimensions :
                 ** When working with lower dimensionsal data, unused dimensions are set to 1.
                 ** Referes to the cudnnSetTensorNdDescriptor documentation from :
                 ** https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html
                 **/
-                std::vector<int> dims(mTensor.dims().begin(), mTensor.dims().end());
+                std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
+                std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());

-                if (dims.size() < 4)
+                if (dims.size() < 4) {
                     dims.resize(4, 1);
-
-                std::vector<int> strides(dims.size(), 1);
-
-                for (size_t dim = 1; dim < dims.size(); ++dim) {
-                    strides[dims.size() - dim - 1] = strides[dims.size() - dim] * dims[dims.size() - dim];
+                    strides.resize(4, 1);
                 }

                 CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
@@ -213,27 +220,27 @@ public:
     }

     void setRawPtr(void *ptr, NbElts_t length) override final {
-        AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
+        AIDGE_ASSERT(length >= mNbElts, "trying to set raw pointer of insufficient capacity");
         mData = future_std::span<T>(static_cast<T *>(ptr), length);
         mDataOwner.reset();
     };

-    virtual ~TensorImpl_cuda() {
-        if (mCudnnTensor != nullptr)
-            cudnnDestroyTensorDescriptor(mCudnnTensor);
-    }
+    virtual ~TensorImpl_cuda() = default;

 private:
     void lazyInit() {
-        if (mData.size() < mTensor.size()) {
+        if (mData.size() < mNbElts) {
             // Need more data, a re-allocation will occur
             AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
-            mDataOwner.reset(cudaAlloc(mTensor.size()));
-            mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
+            mDataOwner.reset(cudaAlloc(mNbElts));
+            mData = future_std::span<T>(mDataOwner.get(), mNbElts);
         }
     }
 };

+template <typename T>
+const std::string TensorImpl_cuda<T>::Backend = "cuda";
+
 namespace {
 static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
         {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
......
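For context, a minimal usage sketch of the reworked TensorImpl_cuda, built only from calls that appear elsewhere in this diff (Tensor(DataType), setBackend, resize, getImpl, copyFromHost); the function name, dimensions and data below are illustrative assumptions, not part of the commit:

// Hypothetical sketch: allocate a float tensor on the CUDA backend and upload
// host data through the offset-aware copyFromHost() overload introduced here.
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"

void uploadExample() {
    Aidge::Tensor t(Aidge::DataType::Float32);          // data type chosen up front
    t.setBackend("cuda");                               // picks TensorImpl_cuda<float> from the registrar
    t.resize({1, 3, 32, 32});                           // device memory is lazily allocated on first access

    std::vector<float> host(t.size(), 1.0f);            // host-side buffer
    t.getImpl()->copyFromHost(host.data(), t.size());   // cudaMemcpyHostToDevice under the hood
}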
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/AvgPooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
template <DimIdx_t DIM>
class AvgPoolingImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
std::shared_ptr<Tensor> mInputFallback;
public:
AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<2> &op) {
return std::make_unique<AvgPoolingImpl_cuda>(op);
}
public:
void forward();
~AvgPoolingImpl_cuda();
private:
template <class T> void forward_(const Tensor& input);
};
namespace {
// add cuda backend to AvgPooling_Op<2> implementation registry
static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl_cuda("cuda", Aidge::AvgPoolingImpl_cuda<2>::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */
@@ -34,22 +34,30 @@ private:
     cudnnConvolutionDescriptor_t mConvDesc = nullptr;
     cudnnFilterDescriptor_t mFilterDesc = nullptr;
     cudnnConvolutionFwdAlgo_t mFwdAlgo;
+    cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
+    cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
     size_t mWorkspaceSize = 0;
-    void* mWorkspace = nullptr;
+    void* mFwdWorkspace = nullptr;
+    void* mBwdWorkspace = nullptr;
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;

 public:
-    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op) {}
+    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {}

-    static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) {
+    static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) {
         return std::make_unique<ConvImpl_cuda>(op);
     }

 public:
     void forward();
+    void backward();
     ~ConvImpl_cuda();

 private:
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+    template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
 };

 namespace {
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class FCImplForward_cuda : public Registrable<FCImplForward_cuda,
std::tuple<DataType>,
void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {};
class FCImpl_cuda : public OperatorImpl {
private:
std::shared_ptr<Tensor> mInput0Fallback;
std::shared_ptr<Tensor> mInput1Fallback;
std::shared_ptr<Tensor> mInput2Fallback;
public:
FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) {
return std::make_unique<FCImpl_cuda>(op);
}
public:
void forward();
// ~FCImpl_cuda();
private:
template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels);
};
namespace {
// add cuda backend to FC_Op implementation registry
static Registrar<FC_Op> registrarFCImpl_cuda("cuda", Aidge::FCImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */
@@ -9,26 +9,28 @@
  *
  ********************************************************************************/

-#include <cassert>
-#include <numeric> // std::accumulate
-#include <vector>
-
-#include "aidge/data/Tensor.hpp"
-#include "aidge/operator/Producer.hpp"
-#include "aidge/utils/Types.h"
-
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
-
-Aidge::DimSize_t Aidge::ProducerImpl_cuda::getNbProducedData(
-    Aidge::IOIndex_t outputIdx) const
-{
-    // Requires the whole tensors, regardless of available data on inputs
-    assert(outputIdx == 0 && "operator has only one output");
-    (void) outputIdx;
-
-    return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
-}
-
-void Aidge::ProducerImpl_cuda::forward()
-{
-}
+#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+template <class T>
+cublasStatus_t cublasGemm(cublasHandle_t handle,
+                          cublasOperation_t transa, cublasOperation_t transb,
+                          int m, int n, int k,
+                          const T *alpha,
+                          const T *A, int lda,
+                          const T *B, int ldb,
+                          const T *beta,
+                          T *C, int ldc);
+}
+#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
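The header above only declares the templated cublasGemm wrapper; its definitions are not part of the hunks shown here. As an illustration, a plausible float specialization that forwards to cuBLAS is sketched below; the actual implementation in the accompanying source file may differ:

// Hypothetical sketch: explicit specialization forwarding to cublasSgemm.
#include <cublas_v2.h>

namespace Aidge {
template <>
cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
                                 cublasOperation_t transa, cublasOperation_t transb,
                                 int m, int n, int k,
                                 const float *alpha,
                                 const float *A, int lda,
                                 const float *B, int ldb,
                                 const float *beta,
                                 float *C, int ldc)
{
    // Column-major GEMM: C = alpha * op(A) * op(B) + beta * C
    return cublasSgemm(handle, transa, transb, m, n, k,
                       alpha, A, lda, B, ldb, beta, C, ldc);
}
} // namespace Aidge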
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
template <DimIdx_t DIM>
class MaxPoolingImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
std::shared_ptr<Tensor> mInputFallback;
public:
MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<2> &op) {
return std::make_unique<MaxPoolingImpl_cuda>(op);
}
public:
void forward();
~MaxPoolingImpl_cuda();
private:
template <class T> void forward_(const Tensor& input);
};
namespace {
// add cuda backend to MaxPooling_Op<2> implementation registry
static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl_cuda("cuda", Aidge::MaxPoolingImpl_cuda<2>::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */
@@ -9,32 +9,53 @@
  *
  ********************************************************************************/

-#ifndef AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
-#define AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_

+#include <array>
 #include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>

 #include "aidge/backend/OperatorImpl.hpp"
-#include "aidge/operator/Producer.hpp"
+#include "aidge/operator/ReLU.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"

 namespace Aidge {
-class ProducerImpl_cuda : public OperatorImpl {
+class ReLUImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    #if CUDNN_VERSION >= 5000
+        cudnnActivationDescriptor_t mReLUDesc = nullptr;
+    #else
+        cudnnActivationMode_t mReLUDesc = nullptr;
+    #endif
+    std::shared_ptr<Tensor> mInputFallback;
+
 public:
-    ProducerImpl_cuda(const Producer_Op &op) : OperatorImpl(op) {}
+    ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {}

-    static std::unique_ptr<ProducerImpl_cuda> create(const Producer_Op &op) {
-        return std::make_unique<ProducerImpl_cuda>(op);
+    static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) {
+        return std::make_unique<ReLUImpl_cuda>(op);
     }

-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-    void forward() override;
+public:
+    void forward();
+    ~ReLUImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
 };

 namespace {
-static Registrar<Producer_Op> registrarProducerImpl_cuda("cuda", Aidge::ProducerImpl_cuda::create);
+// add cuda backend to ReLU_Op implementation registry
+static Registrar<ReLU_Op> registrarReLUImpl_cuda("cuda", Aidge::ReLUImpl_cuda::create);
 }  // namespace
 }  // namespace Aidge

-#endif /* AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ */
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */
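ReLUImpl_cuda::forward() itself is not part of the hunks shown on this page. For orientation, a minimal sketch of how the lazily created mReLUDesc would typically be configured and used is given below, mirroring the AvgPoolingImpl_cuda::forward_() shown elsewhere in this commit and assuming CUDNN_VERSION >= 5000 (so mReLUDesc is a cudnnActivationDescriptor_t); the real implementation may differ:

// Hypothetical sketch of a ReLUImpl_cuda::forward_<T>() body.
template <class T>
void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);

    // Lazily create and configure the cuDNN activation descriptor.
    if (mReLUDesc == nullptr) {
        CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
        CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
            mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
    }

    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
    const typename Cuda::cudnn_scaling_type<T>::type beta  = 0.0f;
    CHECK_CUDNN_STATUS(cudnnActivationForward(
        CudaContext::cudnnHandle(),
        mReLUDesc,
        &alpha,
        std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
        input.getImpl()->rawPtr(),
        &beta,
        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
        op.getOutput(0)->getImpl()->rawPtr()));
}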
@@ -2,8 +2,8 @@
 #define AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H

 #include <vector>
-#include <cstdio>

+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"

 namespace Aidge {
@@ -87,7 +87,7 @@ public:
         if (cublas_h[dev] == NULL) {
             CHECK_CUBLAS_STATUS(cublasCreate(&cublas_h[dev]));
-            printf("CUBLAS initialized on device #%d\n", dev);
+            fmt::print("CUBLAS initialized on device #{}\n", dev);
         }

         return cublas_h[dev];
@@ -113,7 +113,7 @@ public:
         if (cudnn_h[dev] == NULL) {
             CHECK_CUDNN_STATUS(cudnnCreate(&cudnn_h[dev]));
-            printf("CUDNN initialized on device #%d\n", dev);
+            fmt::print("CUDNN initialized on device #{}\n", dev);
         }

         return cudnn_h[dev];
......
@@ -11,6 +11,9 @@
 #include <cuda.h>
 #include <cudnn.h>

+#include "aidge/data/half.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
 #define CHECK_CUDNN_STATUS(status) \
 do { \
     const cudnnStatus_t e = (status); \
@@ -62,6 +65,29 @@
 namespace Aidge {
 namespace Cuda {
+    // CuDNN scaling parameters are typically "alpha" and "beta".
+    // Their type must be "float" for HALF and FLOAT (default template)
+    // and "double" for DOUBLE (specialized template)
+    template <class T>
+    struct cudnn_scaling_type {
+        typedef float type;
+    };
+
+    template <>
+    struct cudnn_scaling_type<double> {
+        typedef double type;
+    };
+
+    template <class T>
+    struct cuda_type {
+        typedef T type;
+    };
+
+    template <>
+    struct cuda_type<half_float::half> {
+        typedef __half type;
+    };
+
     const char* cublasGetErrorString(cublasStatus_t error);

     // Enable Peer-to-Peer communications between devices
......
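To make the scaling-parameter comment above concrete, the pattern followed by the operator implementations later in this commit looks like the sketch below; the wrapper function name is hypothetical:

// Sketch: alpha/beta take the type cuDNN expects for the tensor data type T
// (float for half_float::half and float, double for double).
template <class T>
void launchScaledCudnnCall(/* ...descriptors and device pointers... */) {
    const typename Aidge::Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
    const typename Aidge::Cuda::cudnn_scaling_type<T>::type beta  = 0.0f;
    // e.g. cudnnConvolutionForward(handle, &alpha, ..., &beta, ...);
    // Cuda::cuda_type<T>::type maps half_float::half to the native __half type
    // when a raw CUDA kernel or cuBLAS call needs it.
}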
@@ -62,10 +62,10 @@ class CMakeBuild(build_ext):
         os.chdir(str(build_temp))

         # Impose to use the executable of the python
         # used to launch setup.py to setup PythonInterp
         param_py = "-DPYTHON_EXECUTABLE=" + sys.executable

         compile_type = 'Debug'
         install_path = os.path.join(sys.prefix, "lib", "libAidge") if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"]

@@ -85,11 +85,11 @@ class CMakeBuild(build_ext):
         for file in files:
             if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())):
                 currentFile=os.path.join(root, file)
                 shutil.copy(currentFile, str(aidge_package.absolute()))

         # Copy version.txt in aidge_package
         os.chdir(os.path.dirname(__file__))
         shutil.copy("version.txt", str(aidge_package.absolute()))


 if __name__ == '__main__':
@@ -108,7 +108,7 @@ if __name__ == '__main__':
         cmdclass={
             'build_ext': CMakeBuild,
         },
-        install_requires=['aidge_core', 'aidge_backend_cpu'],
+        install_requires=['aidge_core'],
         zip_safe=False,
     )
@@ -91,10 +91,10 @@ template <class T>
 bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
     const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);

-    if (mTensor.size() != otherImplCuda.mTensor.size())
+    if (mNbElts != otherImplCuda.size())
         return false;

     thrust::device_ptr<T> thrustData(mData.data());
     thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData.data());
-    return thrust::equal(thrustData, thrustData + mTensor.size(), thrustOtherData);
+    return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData);
 }
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/AvgPooling.hpp"
#include "aidge/utils/Types.h"
template <Aidge::DimIdx_t DIM>
void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
assert(mOp.getRawInput(0) && "missing input #0");
const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
// Lazy-initialize CuDNN AvgPooling descriptor
if (mAvgPoolingDesc == nullptr) {
const AvgPooling_Op<DIM>& avgPoolingOp = static_cast<const AvgPooling_Op<DIM>&>(op);
const std::vector<int> strides(avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().end());
const std::vector<int> paddings(DIM, 0);
const std::vector<int> window_dims(avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().end());
CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc));
CHECK_CUDNN_STATUS(
cudnnSetPoolingNdDescriptor(mAvgPoolingDesc,
mMode,
CUDNN_NOT_PROPAGATE_NAN,
DIM,
&window_dims[0],
&paddings[0],
&strides[0]));
}
switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
case DataType::Float64:
forward_<double>(input);
break;
case DataType::Float32:
forward_<float>(input);
break;
case DataType::Float16:
forward_<half>(input);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
}
}
template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
CHECK_CUDNN_STATUS(
cudnnPoolingForward(
CudaContext::cudnnHandle(),
mAvgPoolingDesc,
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
input.getImpl()->rawPtr(),
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
)
);
}
template <Aidge::DimIdx_t DIM>
Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() {
if(mAvgPoolingDesc != nullptr)
cudnnDestroyPoolingDescriptor(mAvgPoolingDesc);
}
// Template declarations
template class Aidge::AvgPoolingImpl_cuda<2>;
@@ -10,32 +10,26 @@
  ********************************************************************************/

 #include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
 #include <vector>

-#include "aidge/utils/Types.h"
-#include "aidge/operator/Conv.hpp"
-
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
-#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/utils/Types.h"

 template <Aidge::DimIdx_t DIM>
 void Aidge::ConvImpl_cuda<DIM>::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
     // FIXME: uncomment the following code once memory handling will work
     assert(mOp.getRawInput(0) && "missing input #0");
     assert(mOp.getRawInput(1) && "missing input #1");

     // Convert input data (no overhead if not needed!)
-    // TODO: right now, if needed, memory will be allocated/deallocated at each
-    // call to forward(). We might put the following shared_ptr as members of
-    // this class to avoid that.
-    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
-    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0));
+    const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));

     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
@@ -45,14 +39,13 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());

         CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
-        CHECK_CUDNN_STATUS(
-            cudnnSetConvolutionNdDescriptor(mConvDesc,
-                DIM,
-                &paddings[0],
-                &strides[0],
-                &upscales[0],
-                CUDNN_CROSS_CORRELATION,
-                DataTypeToCudnn(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType())));
+        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
+            DIM,
+            &paddings[0],
+            &strides[0],
+            &upscales[0],
+            CUDNN_CROSS_CORRELATION,
+            DataTypeToCudnn(op.getOutput(0)->dataType())));
     }

     // Lazy-initialize CuDNN filter descriptor
@@ -61,14 +54,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
         CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
             DataTypeToCudnn(input1.dataType()),
             CUDNN_TENSOR_NCHW,
             kernels.size(),
             &kernels[0]));
     }

     // Set forward algorithm and allocate the required workspace
-    if (mWorkspace == nullptr) {
+    if (mFwdWorkspace == nullptr) {
         // Find the best CuDNN forward algorithm (the one with the lowest compute time)
         int maxAlgoIterations = 0;
         cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
@@ -80,14 +73,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations);
         CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
             CudaContext::cudnnHandle(),
-            dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
             mFilterDesc,
             mConvDesc,
-            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
             maxAlgoIterations,
             &returnAlgoCounts,
             &returnFwdAlgo[0]));
         mFwdAlgo = returnFwdAlgo[0].algo;

         // Allocate the workspace required by the chosen CuDNN forward algorithm
@@ -95,48 +88,54 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize(
             CudaContext::cudnnHandle(),
-            dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
             mFilterDesc,
             mConvDesc,
-            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
             mFwdAlgo,
             &workspaceSize));

-        CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, workspaceSize));
+        CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
         mWorkspaceSize = workspaceSize;
     }

     // Do the actual forward computation
     // Template is only for scaling parameters, which are always in float
     // excepted when the convolution is performed in double precision.
-    if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
-        forward_<double>(input0, input1, input2);
-    }
-    else {
-        forward_<float>(input0, input1, input2);
+    switch(op.getOutput(0)->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input0, input1, input2);
+            break;
+        case DataType::Float32:
+            forward_<float>(input0, input1, input2);
+            break;
+        case DataType::Float16:
+            forward_<half>(input0, input1, input2);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
     }
 }

 template <Aidge::DimIdx_t DIM>
 template <class T>
 void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
-    const T alpha = 1.0f;
-    const T beta = 0.0f;
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;

-    CHECK_CUDNN_STATUS(
-        cudnnConvolutionForward(CudaContext::cudnnHandle(),
-            &alpha,
-            dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
-            input0.getImpl()->rawPtr(),
-            mFilterDesc,
-            input1.getImpl()->rawPtr(),
-            mConvDesc,
-            mFwdAlgo,
-            mWorkspace,
-            mWorkspaceSize,
-            &beta,
-            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
-            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));
+    CHECK_CUDNN_STATUS(cudnnConvolutionForward(CudaContext::cudnnHandle(),
+        &alpha,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+        input0.getImpl()->rawPtr(),
+        mFilterDesc,
+        input1.getImpl()->rawPtr(),
+        mConvDesc,
+        mFwdAlgo,
+        mFwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+        op.getOutput(0)->getImpl()->rawPtr()));

     // Add bias (if there is any)
     if (mOp.getRawInput(2) && input2.size() > 0) {
@@ -151,12 +150,182 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp
         // TODO: find a more elegant solution(?)
         CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
             &alpha,
-            dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl())->getCudnnTensorDesc(bias),
             input2.getImpl()->rawPtr(),
             &alpha,
-            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
-            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            op.getOutput(0)->getImpl()->rawPtr()));
     }
 }
template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::backward() {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
// FIXME: uncomment the following code once memory handling will work
assert(mOp.getRawInput(0) && "missing input #0");
assert(mOp.getRawInput(1) && "missing input #1");
// Convert input data (no overhead if not needed!)
const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0));
const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0));
const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0));
// Set forward algorithm and allocate the required workspace
if (mBwdWorkspace == nullptr) {
// Find the best CuDNN backward algorithm (the one with the lowest compute time)
int maxAlgoIterations = 0;
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(),
&maxAlgoIterations);
assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm");
int returnAlgoCounts = 0;
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations);
CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm(
CudaContext::cudnnHandle(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
mFilterDesc,
maxAlgoIterations,
&returnAlgoCounts,
&returnBwdFilterAlgo[0]));
mBwdFilterAlgo = returnBwdFilterAlgo[0].algo;
maxAlgoIterations = 0;
cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(),
&maxAlgoIterations);
assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm");
returnAlgoCounts = 0;
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations);
CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm(
CudaContext::cudnnHandle(),
mFilterDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
maxAlgoIterations,
&returnAlgoCounts,
&returnBwdDataAlgo[0]));
mBwdDataAlgo = returnBwdDataAlgo[0].algo;
// Allocate the workspace required by the chosen CuDNN backward algorithm
size_t workspaceSize = 0;
CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize(
CudaContext::cudnnHandle(),
// same arguments as cudnnGetConvolutionBackwardFilterAlgorithm()
// -->
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
mFilterDesc,
// <--
mBwdFilterAlgo,
&workspaceSize));
size_t workspaceSizeData = 0;
CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardDataWorkspaceSize(
CudaContext::cudnnHandle(),
// same arguments as cudnnGetConvolutionBackwardDataAlgorithm() -->
mFilterDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
// <--
mBwdDataAlgo,
&workspaceSizeData));
if (workspaceSizeData > workspaceSize)
workspaceSize = workspaceSizeData;
if (workspaceSize > mWorkspaceSize) {
if (mFwdWorkspace != nullptr) {
cudaFree(mFwdWorkspace);
}
CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
mWorkspaceSize = workspaceSize;
}
mBwdWorkspace = mFwdWorkspace;
}
// Do the actual backward computation
// Template is only for scaling parameters, which are always in float
    // except when the convolution is performed in double precision.
if (op.getOutput(0)->dataType() == DataType::Float64) {
backward_<double>(input0, input1, input2);
}
else {
backward_<float>(input0, input1, input2);
}
}
template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
std::shared_ptr<Tensor> gradOutputFallback;
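    // Bring the output gradient onto the CUDA backend with the same data type as the input gradient (copied/cast only if needed)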
const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad()));
const T alpha = 1.0f;
const T beta = 0.0f;
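    // Weights gradient: computed from the forward input (input0) and the output gradient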
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter(
CudaContext::cudnnHandle(),
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
input0.getImpl()->rawPtr(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
mConvDesc,
mBwdFilterAlgo,
mBwdWorkspace,
mWorkspaceSize,
&beta,
mFilterDesc,
op.getInput(1)->grad()->getImpl()->rawPtr()));
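    // Input gradient: computed from the weights (input1) and the output gradient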
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardData(
CudaContext::cudnnHandle(),
&alpha,
mFilterDesc,
input1.getImpl()->rawPtr(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
mConvDesc,
mBwdDataAlgo,
mBwdWorkspace,
mWorkspaceSize,
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
op.getInput(0)->grad()->getImpl()->rawPtr()));
// Add bias (if there is any)
if (mOp.getRawInput(2) && input2.size() > 0) {
        // Bias gradient tensor needs to have the same number of dimensions as the output tensor for cudnnConvolutionBackwardBias()
std::vector<DimSize_t> gradBiasDims(DIM+2, 1);
gradBiasDims[1] = op.getInput(2)->grad()->size();
// Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
Tensor gradBias(op.getInput(2)->grad()->dataType());
gradBias.setBackend("cuda");
gradBias.resize(gradBiasDims);
// TODO: find a more elegant solution(?)
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(),
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias),
op.getInput(2)->grad()->getImpl()->rawPtr()));
    }
}
@@ -170,8 +339,8 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
        cudnnDestroyFilterDescriptor(mFilterDesc);
    }
    if (mFwdWorkspace != nullptr) {
        cudaFree(mFwdWorkspace);
    }
}
...
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/Types.h"
void Aidge::FCImpl_cuda::forward() {
assert(mOp.getRawInput(0) && "missing input #0");
assert(mOp.getRawInput(1) && "missing input #1");
assert(mOp.getRawInput(2) && "missing input #2");
const auto& fcOp = static_cast<const FC_Op&>(mOp);
bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0));
switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
case DataType::Float64:
forward_<double>(input0, input1, input2, noBias, outChannels);
break;
case DataType::Float32:
forward_<float>(input0, input1, input2, noBias, outChannels);
break;
case DataType::Float16:
forward_<half>(input0, input1, input2, noBias, outChannels);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
}
}
template<class T>
void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
{
const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
// Performing output = T(weights) * input
// [n x m] = [n x k] * [k x m]
    // cuBLAS is column-major, so rather than transposing the inputs, computing the output as [m x n] and then transposing it back,
    // we directly compute the output as [n x m]
int n = outChannels;
int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n;
int k = input0.size()/m;
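    // Shapes: n = number of output channels, m = batch size, k = number of input features per sample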
    int lda = k; // leading dimension of the input matrix (matrix B in the GEMM call below)
    int ldb = k; // leading dimension of the weights matrix (matrix A in the GEMM call below)
int ldc = n; // leading dimension of output
const T alpha = 1.0f;
const T beta = 0.0f;
CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
CUBLAS_OP_T,
CUBLAS_OP_N,
n,
m,
k,
reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
weights,
ldb,
input,
lda,
reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
output,
ldc));
if(!noBias){
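        // Broadcast the bias over the batch with a rank-1 GEMM: allocate a device vector of m ones to serve as the [1 x m] operand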
T* onesVector;
CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T)));
// Fill the vector with ones
std::vector<T> onesVec(m, T(1.0));
CHECK_CUDA_STATUS(cudaMemcpy(onesVector,
&onesVec[0],
m * sizeof(T),
cudaMemcpyHostToDevice));
const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr());
// Performing output = biases * onesVector + output
// [n x m] = [n x 1] * [1 x m] + [n x m]
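        // Note: &alpha is also passed as the beta factor below (both equal 1), so the bias term is accumulated on top of the previous GEMM result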
CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
1,
reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
biases,
n,
onesVector,
1,
reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
output,
n));
cudaFree(onesVector);
}
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <stdio.h>
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
namespace Aidge{
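// Explicit specializations of the cublasGemm() wrapper: each one forwards to the cuBLAS GEMM routine
// of the matching precision (cublasHgemm, cublasSgemm, cublasDgemm)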
template <>
cublasStatus_t cublasGemm<__half>(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const __half *alpha,
const __half *A, int lda,
const __half *B, int ldb,
const __half *beta,
__half *C, int ldc)
{
return cublasHgemm(handle,
transa, transb,
m, n, k,
alpha,
A, lda,
B, ldb,
beta,
C, ldc);
}
template <>
cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc)
{
return cublasSgemm(handle,
transa, transb,
m, n, k,
alpha,
A, lda,
B, ldb,
beta,
C, ldc);
}
template <>
cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const double *alpha,
const double *A, int lda,
const double *B, int ldb,
const double *beta,
double *C, int ldc)
{
return cublasDgemm(handle,
transa, transb,
m, n, k,
alpha,
A, lda,
B, ldb,
beta,
C, ldc);
}
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/utils/Types.h"
template <Aidge::DimIdx_t DIM>
void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
assert(mOp.getRawInput(0) && "missing input #0");
const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
// Lazy-initialize CuDNN MaxPooling descriptor
if (mMaxPoolingDesc == nullptr) {
const MaxPooling_Op<DIM>& maxPoolingOp = static_cast<const MaxPooling_Op<DIM>&>(op);
const std::vector<int> strides(maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().end());
const std::vector<int> paddings(DIM, 0);
const std::vector<int> window_dims(maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().end());
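        // cudnnSetPoolingNdDescriptor() expects plain int arrays, so the kernel and stride attributes are converted above; no padding is applied by this operator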
CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc));
CHECK_CUDNN_STATUS(
cudnnSetPoolingNdDescriptor(mMaxPoolingDesc,
mMode,
CUDNN_NOT_PROPAGATE_NAN,
DIM,
&window_dims[0],
&paddings[0],
&strides[0]));
}
switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
case DataType::Float64:
forward_<double>(input);
break;
case DataType::Float32:
forward_<float>(input);
break;
case DataType::Float16:
forward_<half>(input);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
}
}
template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
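    // alpha/beta scaling factors are kept in the type required by cuDNN (float for half/float data, double for double-precision data)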
const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
CHECK_CUDNN_STATUS(
cudnnPoolingForward(
CudaContext::cudnnHandle(),
mMaxPoolingDesc,
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
input.getImpl()->rawPtr(),
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
)
);
}
template <Aidge::DimIdx_t DIM>
Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() {
if(mMaxPoolingDesc != nullptr)
cudnnDestroyPoolingDescriptor(mMaxPoolingDesc);
}
// Template declarations
template class Aidge::MaxPoolingImpl_cuda<2>;
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Types.h"
void Aidge::ReLUImpl_cuda::forward() {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
assert(mOp.getRawInput(0) && "missing input #0");
const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
// Lazy-initialize CuDNN ReLU descriptor
if (mReLUDesc == nullptr) {
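        // Activation descriptors are only available from cuDNN 5.0 onward; older versions pass the activation mode enum directly (see the #else branch)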
#if CUDNN_VERSION >= 5000
CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
#else
mReLUDesc = CUDNN_ACTIVATION_RELU;
#endif
}
switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
case DataType::Float64:
forward_<double>(input);
break;
case DataType::Float32:
forward_<float>(input);
break;
case DataType::Float16:
forward_<half>(input);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
}
}
template <class T>
void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
CHECK_CUDNN_STATUS(
cudnnActivationForward(CudaContext::cudnnHandle(),
mReLUDesc,
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
input.getImpl()->rawPtr(),
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
}
Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() {
if (mReLUDesc != nullptr) {
#if CUDNN_VERSION >= 5000
cudnnDestroyActivationDescriptor(mReLUDesc);
#endif
}
}