diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml index 01c537fce668e95f5e632cce8b58eda0da36ecdd..d12c714d34b2d457a5b4a675bbd4b99c1211362b 100644 --- a/.gitlab/ci/build.gitlab-ci.yml +++ b/.gitlab/ci/build.gitlab-ci.yml @@ -1,3 +1,6 @@ +include: + - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml' + build:ubuntu_cpp: stage: build needs: [] @@ -6,15 +9,14 @@ build:ubuntu_cpp: script: # Download dependencies + - DEPENDENCY_JOB="build:ubuntu_cpp" # aidge_core - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] # aidge_backend_cpu - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp - + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] + # Build current module - export CMAKE_PREFIX_PATH=../install_cpp - mkdir -p build_cpp @@ -35,15 +37,14 @@ build:ubuntu_cpp_g++10: - docker script: - # Download dependencies + # Download dependencies + - DEPENDENCY_JOB="build:ubuntu_cpp" # aidge_core - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] # aidge_backend_cpu - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] # Build current module - export CMAKE_PREFIX_PATH=../install_cpp @@ -63,14 +64,14 @@ build:ubuntu_cpp_g++12: script: # Download dependencies + - DEPENDENCY_JOB="build:ubuntu_cpp" # aidge_core - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] # aidge_backend_cpu - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] + # Build current module - export CMAKE_PREFIX_PATH=../install_cpp @@ -90,14 +91,14 @@ build:ubuntu_cpp_clang12: script: # Download dependencies + - DEPENDENCY_JOB="build:ubuntu_cpp" # aidge_core - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] # aidge_backend_cpu - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . 
- - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] + # Build current module - export CMAKE_PREFIX_PATH=../install_cpp @@ -117,14 +118,13 @@ build:ubuntu_cpp_clang15: script: # Download dependencies + - DEPENDENCY_JOB="build:ubuntu_cpp" # aidge_core - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] # aidge_backend_cpu - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"' - - unzip -o build_artifacts.zip -d . - - rm -rf build_cpp + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] # Build current module - export CMAKE_PREFIX_PATH=../install_cpp @@ -144,12 +144,13 @@ build:ubuntu_python: script: # Download dependencies - # aidge_core (Python) - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_python"' - - unzip -o build_artifacts.zip -d . - # aidge_backend_cpu (Python) - - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_python"' - - unzip -o build_artifacts.zip -d . + - DEPENDENCY_JOB="build:ubuntu_python" + # aidge_core (python) + - DEPENDENCY_NAME="aidge_core" + - !reference [.download_dependency, script] + # aidge_backend_cpu (python) + - DEPENDENCY_NAME="aidge_backend_cpu" + - !reference [.download_dependency, script] - python3 -m pip install virtualenv - virtualenv venv diff --git a/CMakeLists.txt b/CMakeLists.txt index 21f57bc75b2fa4ad8b57711b092693c70b20d628..d940def39a4e092bb01765a0b127b41c6a88914f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,11 +7,12 @@ file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project) message(STATUS "Project name: ${project}") message(STATUS "Project version: ${version}") -# Note : project name is {project} and python module name is also {project} +# Note : project name is {project} and python module name is also {project} set(module_name _${project}) # target name project(${project}) +set(CXX_STANDARD 14) ############################################## # Define options @@ -19,6 +20,7 @@ option(PYBIND "python binding" ON) option(WERROR "Warning as error" OFF) option(TEST "Enable tests" ON) option(COVERAGE "Enable coverage" OFF) +option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF) ############################################## # Import utils CMakeLists @@ -36,8 +38,9 @@ enable_language(CUDA) find_package(CUDAToolkit REQUIRED) find_package(aidge_core REQUIRED) -find_package(aidge_backend_cpu REQUIRED) - +if(TEST) + find_package(aidge_backend_cpu REQUIRED) +endif() ############################################## # Create target and set properties @@ -48,11 +51,31 @@ add_library(${module_name} ${src_files} ${inc_files}) target_link_libraries(${module_name} PUBLIC _aidge_core # _ is added because we link the target not the project - _aidge_backend_cpu # _ is added because we link the target not the project CUDA::cudart + CUDA::cublas cudnn ) +if( ${ENABLE_ASAN} ) + message("Building ${module_name} with ASAN.") + set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer) + 
target_link_libraries(${module_name} + PUBLIC + -fsanitize=address + ) + target_compile_options(${module_name} + PRIVATE + ${SANITIZE_FLAGS} + ) +endif() + +if(TEST) + target_link_libraries(${module_name} + PUBLIC + _aidge_backend_cpu # _ is added because we link the target not the project + ) +endif() + #Set target properties target_include_directories(${module_name} PUBLIC @@ -76,7 +99,7 @@ if (PYBIND) # Handles Python + pybind11 headers dependencies target_link_libraries(${module_name} - PUBLIC + PUBLIC pybind11::pybind11 PRIVATE Python::Python @@ -119,8 +142,8 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(EXPORT ${project}-targets FILE "${project}-targets.cmake" DESTINATION ${INSTALL_CONFIGDIR} -# COMPONENT ${module_name} -) +# COMPONENT ${module_name} +) #Create a ConfigVersion.cmake file include(CMakePackageConfigHelpers) diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index cfae53b64115aa7946580d00f45be56f17163d7f..5b9b94f03a2c5a099f010ab7117479030003f5a8 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -13,7 +13,10 @@ #define AIDGE_BACKEND_CUDA_IMPORTS_H_ #include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" #include "aidge/backend/cuda/operator/ConvImpl.hpp" -#include "aidge/backend/cuda/operator/ProducerImpl.hpp" +#include "aidge/backend/cuda/operator/FCImpl.hpp" +#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cuda/operator/ReLUImpl.hpp" #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index c61e926c88a9baf1fcdf64794c2a975a1b891356..f083a8ba9f68ed53929db95b3fd6604f31548e21 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -1,6 +1,10 @@ #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ +#include <cstddef> // std::size_t +#include <memory> +#include <string> + #include "aidge/backend/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/utils/Registrar.hpp" @@ -24,19 +28,27 @@ void thrust_copy(const half_float::half* srcData, half_float::half* dstData, siz /** * @brief Abstract class for the TensorImpl_cuda class template. - * @details Its purpose is to provide access to base methods that are specific - * to the implementation (which are therefore not present in the TensorImpl + * @details Its purpose is to provide access to base methods that are specific + * to the implementation (which are therefore not present in the TensorImpl * class), but whose data type does not need to be known. */ class TensorImpl_cuda_ { +protected: + mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr; + public: /** * @brief Return the CuDNN tensor descriptor of the tensor. - * @details This method uses lazy initialization for the descriptor + * @details This method uses lazy initialization for the descriptor * (which is therefore mutable in the derived class). * @return cudnnTensorDescriptor_t CuDNN tensor descriptor. 
*/ - virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc() const = 0; + virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0; + + virtual ~TensorImpl_cuda_() { + if (mCudnnTensor != nullptr) + cudnnDestroyTensorDescriptor(mCudnnTensor); + } }; template <class T> @@ -54,119 +66,117 @@ private: } private: - const Tensor &mTensor; // Impl needs to access Tensor information, but is not - // supposed to change it! - /// Pointer to the data and its capacity future_std::span<T> mData; /// If this instance own the data, std::unique_ptr manages it std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner; - mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr; public: - static constexpr const char *Backend = "cuda"; + static const std::string Backend; + + TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {} - TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor), mDataOwner(nullptr, cudaDelete) {} bool operator==(const TensorImpl &otherImpl) const override final; - static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) { - return std::make_unique<TensorImpl_cuda<T>>(tensor); + static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) { + return std::make_shared<TensorImpl_cuda<T>>(device, dims); } // native interface const future_std::span<T>& data() const { return mData; } - std::size_t size() const override { return mData.size(); } - std::size_t scalarSize() const override { return sizeof(T); } - - void setDevice(DeviceIdx_t device) override { - mDevice = device; - } + std::size_t scalarSize() const noexcept override { return sizeof(T); } void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { - void* dst = static_cast<void*>(static_cast<T*>(rawPtr()) + offset); - CHECK_CUDA_STATUS(cudaMemcpy(dst, src, length * sizeof(T), cudaMemcpyDeviceToDevice)); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + const T* srcT = static_cast<const T *>(src); + T* dstT = static_cast<T *>(rawPtr(offset)); + + AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "overlapping copy is not supported"); + CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice)); } - void copyCast(const void *src, NbElts_t length, const DataType srcDt) override { + void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override { if (length == 0) { return; } - AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); - if (srcDt == DataType::Float64) { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + switch (srcDt) { + case DataType::Float64: thrust_copy(static_cast<const double*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Float32) { + break; + case DataType::Float32: thrust_copy(static_cast<const float*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Float16) { + break; + case DataType::Float16: thrust_copy(static_cast<const half_float::half*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Int64) { + break; + case DataType::Int64: thrust_copy(static_cast<const int64_t*>(src), - static_cast<T*>(rawPtr()), + 
static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::UInt64) { + break; + case DataType::UInt64: thrust_copy(static_cast<const uint64_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Int32) { + break; + case DataType::Int32: thrust_copy(static_cast<const int32_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::UInt32) { + break; + case DataType::UInt32: thrust_copy(static_cast<const uint32_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Int16) { + break; + case DataType::Int16: thrust_copy(static_cast<const int16_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::UInt16) { + break; + case DataType::UInt16: thrust_copy(static_cast<const uint16_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::Int8) { + break; + case DataType::Int8: thrust_copy(static_cast<const int8_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else if (srcDt == DataType::UInt8) { + break; + case DataType::UInt8: thrust_copy(static_cast<const uint8_t*>(src), - static_cast<T*>(rawPtr()), + static_cast<T*>(rawPtr(offset)), length); - } - else { + break; + default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type."); + break; } } - void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override { - AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); + void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); } - void copyFromHost(const void *src, NbElts_t length) override { - AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice)); + void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice)); } - void copyToHost(void *dst, NbElts_t length) const override { - AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); - CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost)); + void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost)); } void *rawPtr(NbElts_t offset = 0) override { @@ -175,30 +185,27 @@ public: }; const void *rawPtr(NbElts_t offset = 0) const override { - AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr"); + AIDGE_ASSERT(mData.size() >= mNbElts, "accessing uninitialized const rawPtr"); 
return (mData.data() + offset); }; - const cudnnTensorDescriptor_t& getCudnnTensorDesc() const override { + const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override { if (mCudnnTensor == nullptr) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor)); - if (mTensor.size() > 0) { + if (tensor.size() > 0) { /** ** cudNN Tensors are restricted to having at least 4 dimensions : ** When working with lower dimensionsal data, unused dimensions are set to 1. ** Referes to the cudnnSetTensorNdDescriptor documentation from : ** https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html **/ - std::vector<int> dims(mTensor.dims().begin(), mTensor.dims().end()); + std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); + std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); - if (dims.size() < 4) + if (dims.size() < 4) { dims.resize(4, 1); - - std::vector<int> strides(dims.size(), 1); - - for (size_t dim = 1; dim < dims.size(); ++dim) { - strides[dims.size() - dim - 1] = strides[dims.size() - dim] * dims[dims.size() - dim]; + strides.resize(4, 1); } CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, @@ -213,27 +220,27 @@ public: } void setRawPtr(void *ptr, NbElts_t length) override final { - AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity"); + AIDGE_ASSERT(length >= mNbElts, "trying to set raw pointer of insufficient capacity"); mData = future_std::span<T>(static_cast<T *>(ptr), length); mDataOwner.reset(); }; - virtual ~TensorImpl_cuda() { - if (mCudnnTensor != nullptr) - cudnnDestroyTensorDescriptor(mCudnnTensor); - } + virtual ~TensorImpl_cuda() = default; private: void lazyInit() { - if (mData.size() < mTensor.size()) { + if (mData.size() < mNbElts) { // Need more data, a re-allocation will occur AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data"); - mDataOwner.reset(cudaAlloc(mTensor.size())); - mData = future_std::span<T>(mDataOwner.get(), mTensor.size()); + mDataOwner.reset(cudaAlloc(mNbElts)); + mData = future_std::span<T>(mDataOwner.get(), mNbElts); } } }; +template <typename T> +const std::string TensorImpl_cuda<T>::Backend = "cuda"; + namespace { static Registrar<Tensor> registrarTensorImpl_cuda_Float64( {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create); diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..43a6bd57c0c6431705abe73d3f3c175046d72dc9 --- /dev/null +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/AvgPooling.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +template <DimIdx_t DIM> +class AvgPoolingImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + std::shared_ptr<Tensor> mInputFallback; + +public: + AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<2> &op) { + return std::make_unique<AvgPoolingImpl_cuda>(op); + } + +public: + void forward(); + ~AvgPoolingImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); +}; + +namespace { +// add cuda backend to AvgPooling_Op<2> implementation registry +static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl_cuda("cuda", Aidge::AvgPoolingImpl_cuda<2>::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 0ad995b24082782c611b93fbcc040d1319a7362f..8c591927ce0e52daeff447726c114ce3ae4d0103 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -34,22 +34,30 @@ private: cudnnConvolutionDescriptor_t mConvDesc = nullptr; cudnnFilterDescriptor_t mFilterDesc = nullptr; cudnnConvolutionFwdAlgo_t mFwdAlgo; + cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo; + cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo; size_t mWorkspaceSize = 0; - void* mWorkspace = nullptr; + void* mFwdWorkspace = nullptr; + void* mBwdWorkspace = nullptr; + std::shared_ptr<Tensor> mInput0Fallback; + std::shared_ptr<Tensor> mInput1Fallback; + std::shared_ptr<Tensor> mInput2Fallback; public: - ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op) {} + ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) { + static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda>(op); } public: void forward(); + void backward(); ~ConvImpl_cuda(); private: template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); }; namespace { diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3d8a1348d500fc533c7c9b601b09629995f97427 --- /dev/null +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * 
http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/FC.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class FCImplForward_cuda : public Registrable<FCImplForward_cuda, + std::tuple<DataType>, + void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {}; +class FCImpl_cuda : public OperatorImpl { +private: + std::shared_ptr<Tensor> mInput0Fallback; + std::shared_ptr<Tensor> mInput1Fallback; + std::shared_ptr<Tensor> mInput2Fallback; + + +public: + FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) { + return std::make_unique<FCImpl_cuda>(op); + } + +public: + void forward(); + // ~FCImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels); +}; + +namespace { +// add cuda backend to FC_Op implementation registry +static Registrar<FC_Op> registrarFCImpl_cuda("cuda", Aidge::FCImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9084e01fc08cb3d00e80fc8cf6246064b20591f2 --- /dev/null +++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp @@ -0,0 +1,36 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { + +template <class T> +cublasStatus_t cublasGemm(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, + const T *B, int ldb, + const T *beta, + T *C, int ldc); +} +#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4d08d7fab3c7cb2baa18838fd872e44a8eccc923 --- /dev/null +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/MaxPooling.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +template <DimIdx_t DIM> +class MaxPoolingImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; + std::shared_ptr<Tensor> mInputFallback; + +public: + MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<2> &op) { + return std::make_unique<MaxPoolingImpl_cuda>(op); + } + +public: + void forward(); + ~MaxPoolingImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); +}; + +namespace { +// add cuda backend to MaxPooling_Op<2> implementation registry +static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl_cuda("cuda", Aidge::MaxPoolingImpl_cuda<2>::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ProducerImpl.hpp b/include/aidge/backend/cuda/operator/ProducerImpl.hpp deleted file mode 100644 index 9912133072e23181df8f384841660bf89a829b60..0000000000000000000000000000000000000000 --- a/include/aidge/backend/cuda/operator/ProducerImpl.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2023 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. 
- * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#ifndef AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ -#define AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ - -#include <memory> - -#include "aidge/backend/OperatorImpl.hpp" -#include "aidge/operator/Producer.hpp" -#include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" - -namespace Aidge { -class ProducerImpl_cuda : public OperatorImpl { -public: - ProducerImpl_cuda(const Producer_Op &op) : OperatorImpl(op) {} - - static std::unique_ptr<ProducerImpl_cuda> create(const Producer_Op &op) { - return std::make_unique<ProducerImpl_cuda>(op); - } - - NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final; - void forward() override; -}; - -namespace { -static Registrar<Producer_Op> registrarProducerImpl_cuda("cuda", Aidge::ProducerImpl_cuda::create); -} // namespace -} // namespace Aidge - -#endif /* AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6570662fa5df27e54a9df6f357e918243a71330a --- /dev/null +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/ReLU.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class ReLUImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + #if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mReLUDesc = nullptr; + #else + cudnnActivationMode_t mReLUDesc = nullptr; + #endif + std::shared_ptr<Tensor> mInputFallback; + +public: + ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) { + return std::make_unique<ReLUImpl_cuda>(op); + } + +public: + void forward(); + ~ReLUImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); +}; + +namespace { +// add cuda backend to ReLU_Op implementation registry +static Registrar<ReLU_Op> registrarReLUImpl_cuda("cuda", Aidge::ReLUImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */ diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp index 82dd395e6bbb33bae29c5d881290d6996bfb0332..7218cc24aed718f57a1866be74e7ba9124a5a7f1 100644 --- a/include/aidge/backend/cuda/utils/CudaContext.hpp +++ b/include/aidge/backend/cuda/utils/CudaContext.hpp @@ -2,8 +2,8 @@ #define AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H #include <vector> -#include <cstdio> +#include "aidge/utils/ErrorHandling.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { @@ -87,7 +87,7 @@ public: if 
(cublas_h[dev] == NULL) { CHECK_CUBLAS_STATUS(cublasCreate(&cublas_h[dev])); - printf("CUBLAS initialized on device #%d\n", dev); + fmt::print("CUBLAS initialized on device #{}\n", dev); } return cublas_h[dev]; @@ -113,7 +113,7 @@ public: if (cudnn_h[dev] == NULL) { CHECK_CUDNN_STATUS(cudnnCreate(&cudnn_h[dev])); - printf("CUDNN initialized on device #%d\n", dev); + fmt::print("CUDNN initialized on device #{}\n", dev); } return cudnn_h[dev]; diff --git a/include/aidge/backend/cuda/utils/CudaUtils.hpp b/include/aidge/backend/cuda/utils/CudaUtils.hpp index 2f66d0e778778400f0b7def345619d635cc37674..ab7c805224ed6fe073baf2036b84f4ed6f49b077 100644 --- a/include/aidge/backend/cuda/utils/CudaUtils.hpp +++ b/include/aidge/backend/cuda/utils/CudaUtils.hpp @@ -11,6 +11,9 @@ #include <cuda.h> #include <cudnn.h> +#include "aidge/data/half.hpp" +#include "aidge/utils/ErrorHandling.hpp" + #define CHECK_CUDNN_STATUS(status) \ do { \ const cudnnStatus_t e = (status); \ @@ -62,6 +65,29 @@ namespace Aidge { namespace Cuda { + // CuDNN scaling parameters are typically "alpha" and "beta". + // Their type must be "float" for HALF and FLOAT (default template) + // and "double" for DOUBLE (specialized template) + template <class T> + struct cudnn_scaling_type { + typedef float type; + }; + + template <> + struct cudnn_scaling_type<double> { + typedef double type; + }; + + template <class T> + struct cuda_type { + typedef T type; + }; + + template <> + struct cuda_type<half_float::half> { + typedef __half type; + }; + const char* cublasGetErrorString(cublasStatus_t error); // Enable Peer-to-Peer communications between devices diff --git a/setup.py b/setup.py index dbf82e2826065c812a3bea5f5e1d34557b79f00d..80500f3165dd87eb7b6dd73c78b89806cc8a874a 100644 --- a/setup.py +++ b/setup.py @@ -62,10 +62,10 @@ class CMakeBuild(build_ext): os.chdir(str(build_temp)) - # Impose to use the executable of the python + # Impose to use the executable of the python # used to launch setup.py to setup PythonInterp param_py = "-DPYTHON_EXECUTABLE=" + sys.executable - + compile_type = 'Debug' install_path = os.path.join(sys.prefix, "lib", "libAidge") if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"] @@ -85,11 +85,11 @@ class CMakeBuild(build_ext): for file in files: if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())): currentFile=os.path.join(root, file) - shutil.copy(currentFile, str(aidge_package.absolute())) + shutil.copy(currentFile, str(aidge_package.absolute())) # Copy version.txt in aidge_package os.chdir(os.path.dirname(__file__)) - shutil.copy("version.txt", str(aidge_package.absolute())) + shutil.copy("version.txt", str(aidge_package.absolute())) if __name__ == '__main__': @@ -108,7 +108,7 @@ if __name__ == '__main__': cmdclass={ 'build_ext': CMakeBuild, }, - install_requires=['aidge_core', 'aidge_backend_cpu'], + install_requires=['aidge_core'], zip_safe=False, ) diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu index ecacd4d678dd7d79462332fb28e238b063d8bdd1..898475b5db325afcaedff44756cc2157cf9e2eec 100644 --- a/src/data/TensorImpl.cu +++ b/src/data/TensorImpl.cu @@ -91,10 +91,10 @@ template <class T> bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const { const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl); - if (mTensor.size() != otherImplCuda.mTensor.size()) + if (mNbElts != otherImplCuda.size()) return false; thrust::device_ptr<T> thrustData(mData.data()); thrust::device_ptr<T> 
thrustOtherData(otherImplCuda.mData.data()); - return thrust::equal(thrustData, thrustData + mTensor.size(), thrustOtherData); + return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData); } diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb9cc6a1f4412178525a5e6bccd32e94c4413d4d --- /dev/null +++ b/src/operator/AvgPoolingImpl.cpp @@ -0,0 +1,90 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/AvgPooling.hpp" +#include "aidge/utils/Types.h" + +template <Aidge::DimIdx_t DIM> +void Aidge::AvgPoolingImpl_cuda<DIM>::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + // Lazy-initialize CuDNN AvgPooling descriptor + if (mAvgPoolingDesc == nullptr) { + const AvgPooling_Op<DIM>& avgPoolingOp = static_cast<const AvgPooling_Op<DIM>&>(op); + const std::vector<int> strides(avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().end()); + const std::vector<int> paddings(DIM, 0); + const std::vector<int> window_dims(avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().end()); + + CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc)); + CHECK_CUDNN_STATUS( + cudnnSetPoolingNdDescriptor(mAvgPoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + DIM, + &window_dims[0], + &paddings[0], + &strides[0])); + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnPoolingForward( + CudaContext::cudnnHandle(), + mAvgPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr() + ) + ); +} + +template <Aidge::DimIdx_t DIM> +Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() { + 
if(mAvgPoolingDesc != nullptr) + cudnnDestroyPoolingDescriptor(mAvgPoolingDesc); +} + +// Template declarations +template class Aidge::AvgPoolingImpl_cuda<2>; diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 9c3684e89f6b27133ca99be16b332c4e9f9a27b1..096ee9485a03b736326f46e9a569c6b3c9b5a631 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -10,32 +10,26 @@ ********************************************************************************/ #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for #include <vector> -#include "aidge/utils/Types.h" -#include "aidge/operator/Conv.hpp" - #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ConvImpl.hpp" -#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Conv.hpp" +#include "aidge/utils/Types.h" template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // FIXME: uncomment the following code once memory handling will work assert(mOp.getRawInput(0) && "missing input #0"); assert(mOp.getRawInput(1) && "missing input #1"); // Convert input data (no overhead if not needed!) - // TODO: right now, if needed, memory will be allocated/deallocated at each - // call to forward(). We might put the following shared_ptr as members of - // this class to avoid that. - std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; - const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); // Lazy-initialize CuDNN convolution descriptor if (mConvDesc == nullptr) { @@ -45,14 +39,13 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end()); CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS( - cudnnSetConvolutionNdDescriptor(mConvDesc, - DIM, - &paddings[0], - &strides[0], - &upscales[0], - CUDNN_CROSS_CORRELATION, - DataTypeToCudnn(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()))); + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + DIM, + &paddings[0], + &strides[0], + &upscales[0], + CUDNN_CROSS_CORRELATION, + DataTypeToCudnn(op.getOutput(0)->dataType()))); } // Lazy-initialize CuDNN filter descriptor @@ -61,14 +54,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc)); CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc, - DataTypeToCudnn(input1.dataType()), - CUDNN_TENSOR_NCHW, - kernels.size(), - &kernels[0])); + DataTypeToCudnn(input1.dataType()), + CUDNN_TENSOR_NCHW, + 
kernels.size(), + &kernels[0])); } // Set forward algorithm and allocate the required workspace - if (mWorkspace == nullptr) { + if (mFwdWorkspace == nullptr) { // Find the best CuDNN forward algorithm (the one with the lowest compute time) int maxAlgoIterations = 0; cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(), @@ -80,14 +73,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations); CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm( - CudaContext::cudnnHandle(), - dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(), - mFilterDesc, - mConvDesc, - dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(), - maxAlgoIterations, - &returnAlgoCounts, - &returnFwdAlgo[0])); + CudaContext::cudnnHandle(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + mFilterDesc, + mConvDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + maxAlgoIterations, + &returnAlgoCounts, + &returnFwdAlgo[0])); mFwdAlgo = returnFwdAlgo[0].algo; // Allocate the workspace required by the chosen CuDNN forward algorithm @@ -95,48 +88,54 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize( CudaContext::cudnnHandle(), - dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), mFilterDesc, mConvDesc, - dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), mFwdAlgo, &workspaceSize)); - CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, workspaceSize)); + CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize)); mWorkspaceSize = workspaceSize; } // Do the actual forward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
- if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) { - forward_<double>(input0, input1, input2); - } - else { - forward_<float>(input0, input1, input2); + switch(op.getOutput(0)->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { - const T alpha = 1.0f; - const T beta = 0.0f; - - CHECK_CUDNN_STATUS( - cudnnConvolutionForward(CudaContext::cudnnHandle(), - &alpha, - dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(), - input0.getImpl()->rawPtr(), - mFilterDesc, - input1.getImpl()->rawPtr(), - mConvDesc, - mFwdAlgo, - mWorkspace, - mWorkspaceSize, - &beta, - dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(), - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr())); + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS(cudnnConvolutionForward(CudaContext::cudnnHandle(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + mFilterDesc, + input1.getImpl()->rawPtr(), + mConvDesc, + mFwdAlgo, + mFwdWorkspace, + mWorkspaceSize, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + op.getOutput(0)->getImpl()->rawPtr())); // Add bias (if there is any) if (mOp.getRawInput(2) && input2.size() > 0) { @@ -151,12 +150,182 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp // TODO: find a more elegant solution(?) CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(), - input2.getImpl()->rawPtr(), - &alpha, - dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(), - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr())); + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl())->getCudnnTensorDesc(bias), + input2.getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + op.getOutput(0)->getImpl()->rawPtr())); + } +} + +template <Aidge::DimIdx_t DIM> +void Aidge::ConvImpl_cuda<DIM>::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + // FIXME: uncomment the following code once memory handling will work + assert(mOp.getRawInput(0) && "missing input #0"); + assert(mOp.getRawInput(1) && "missing input #1"); + + // Convert input data (no overhead if not needed!) 
+ const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0)); + const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0)); + const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0)); + + // Set forward algorithm and allocate the required workspace + if (mBwdWorkspace == nullptr) { + // Find the best CuDNN backward algorithm (the one with the lowest compute time) + int maxAlgoIterations = 0; + cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm"); + + int returnAlgoCounts = 0; + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations); + + CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm( + CudaContext::cudnnHandle(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mConvDesc, + mFilterDesc, + maxAlgoIterations, + &returnAlgoCounts, + &returnBwdFilterAlgo[0])); + + mBwdFilterAlgo = returnBwdFilterAlgo[0].algo; + + maxAlgoIterations = 0; + cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm"); + + returnAlgoCounts = 0; + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations); + + CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm( + CudaContext::cudnnHandle(), + mFilterDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mConvDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + maxAlgoIterations, + &returnAlgoCounts, + &returnBwdDataAlgo[0])); + + mBwdDataAlgo = returnBwdDataAlgo[0].algo; + + // Allocate the workspace required by the chosen CuDNN backward algorithm + size_t workspaceSize = 0; + CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize( + CudaContext::cudnnHandle(), + // same arguments as cudnnGetConvolutionBackwardFilterAlgorithm() + // --> + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mConvDesc, + mFilterDesc, + // <-- + mBwdFilterAlgo, + &workspaceSize)); + + size_t workspaceSizeData = 0; + CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardDataWorkspaceSize( + CudaContext::cudnnHandle(), + // same arguments as cudnnGetConvolutionBackwardDataAlgorithm() --> + mFilterDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mConvDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + // <-- + mBwdDataAlgo, + &workspaceSizeData)); + + if (workspaceSizeData > workspaceSize) + workspaceSize = workspaceSizeData; + + if (workspaceSize > mWorkspaceSize) { + if (mFwdWorkspace != nullptr) { + cudaFree(mFwdWorkspace); + } + CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize)); + mWorkspaceSize = workspaceSize; + } + + mBwdWorkspace = mFwdWorkspace; + } + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. 
+ if (op.getOutput(0)->dataType() == DataType::Float64) { + backward_<double>(input0, input1, input2); + } + else { + backward_<float>(input0, input1, input2); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + std::shared_ptr<Tensor> gradOutputFallback; + const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad())); + + const T alpha = 1.0f; + const T beta = 0.0f; + + CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter( + CudaContext::cudnnHandle(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + gradOutput.getImpl()->rawPtr(), + mConvDesc, + mBwdFilterAlgo, + mBwdWorkspace, + mWorkspaceSize, + &beta, + mFilterDesc, + op.getInput(1)->grad()->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnConvolutionBackwardData( + CudaContext::cudnnHandle(), + &alpha, + mFilterDesc, + input1.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + gradOutput.getImpl()->rawPtr(), + mConvDesc, + mBwdDataAlgo, + mBwdWorkspace, + mWorkspaceSize, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); + + // Add bias (if there is any) + if (mOp.getRawInput(2) && input2.size() > 0) { + // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor() + std::vector<DimSize_t> gradBiasDims(DIM+2, 1); + gradBiasDims[1] = op.getInput(2)->grad()->size(); + + // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc()) + Tensor gradBias(op.getInput(2)->grad()->dataType()); + gradBias.setBackend("cuda"); + gradBias.resize(gradBiasDims); + // TODO: find a more elegant solution(?) + + CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + gradOutput.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias), + op.getInput(2)->grad()->getImpl()->rawPtr())); } } @@ -170,8 +339,8 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { cudnnDestroyFilterDescriptor(mFilterDesc); } - if (mWorkspace != nullptr) { - cudaFree(mWorkspace); + if (mFwdWorkspace != nullptr) { + cudaFree(mFwdWorkspace); } } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b60f7fd6aa41f206b2c6eaa5d8f8daa1bd81374 --- /dev/null +++ b/src/operator/FCImpl.cpp @@ -0,0 +1,117 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/FCImpl.hpp" +#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/FC.hpp" +#include "aidge/utils/Types.h" + +void Aidge::FCImpl_cuda::forward() { + assert(mOp.getRawInput(0) && "missing input #0"); + assert(mOp.getRawInput(1) && "missing input #1"); + assert(mOp.getRawInput(2) && "missing input #2"); + + const auto& fcOp = static_cast<const FC_Op&>(mOp); + bool noBias = fcOp.template getAttr<FCAttr::NoBias>(); + std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>()); + + const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, noBias, outChannels); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, noBias, outChannels); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, noBias, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template<class T> +void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels) +{ + const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); + const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); + T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + + // Performing output = T(weights) * input + // [n x m] = [n x k] * [k x m] + // cublas is column-major so instead of transposing inputs, computing output [m x n] and transposing output, we compute output as [n x m] + int n = outChannels; + int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n; + int k = input0.size()/m; + int lda = k; // leading dimension of weights + int ldb = k; // leading dimension of input + int ldc = n; // leading dimension of output + const T alpha = 1.0f; + const T beta = 0.0f; + CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + m, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + weights, + ldb, + input, + lda, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + output, + ldc)); + + if(!noBias){ + T* onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + // Fill the vector with ones + std::vector<T> onesVec(m, T(1.0)); + CHECK_CUDA_STATUS(cudaMemcpy(onesVector, + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); + const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr()); + // Performing output = biases * onesVector + output + // [n x m] = [n x 1] * [1 x m] + [n x m] + 
+        CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
+                                       CUBLAS_OP_N,
+                                       CUBLAS_OP_N,
+                                       n,
+                                       m,
+                                       1,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                       biases,
+                                       n,
+                                       onesVector,
+                                       1,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                       output,
+                                       n));
+
+        cudaFree(onesVector);
+    }
+
+}
\ No newline at end of file
diff --git a/src/operator/FCImpl_CUDA_kernels.cu b/src/operator/FCImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5139ac1d7edf61cf347870e6add2870b2792a0e5
--- /dev/null
+++ b/src/operator/FCImpl_CUDA_kernels.cu
@@ -0,0 +1,76 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+#include <stdio.h>
+
+#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
+
+namespace Aidge{
+
+template <>
+cublasStatus_t cublasGemm<__half>(cublasHandle_t handle,
+                                  cublasOperation_t transa, cublasOperation_t transb,
+                                  int m, int n, int k,
+                                  const __half *alpha,
+                                  const __half *A, int lda,
+                                  const __half *B, int ldb,
+                                  const __half *beta,
+                                  __half *C, int ldc)
+{
+    return cublasHgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
+                                 cublasOperation_t transa, cublasOperation_t transb,
+                                 int m, int n, int k,
+                                 const float *alpha,
+                                 const float *A, int lda,
+                                 const float *B, int ldb,
+                                 const float *beta,
+                                 float *C, int ldc)
+{
+    return cublasSgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
+                                  cublasOperation_t transa, cublasOperation_t transb,
+                                  int m, int n, int k,
+                                  const double *alpha,
+                                  const double *A, int lda,
+                                  const double *B, int ldb,
+                                  const double *beta,
+                                  double *C, int ldc)
+{
+    return cublasDgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+}
\ No newline at end of file
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8d7c81948bd898b7cc4e2f3bead9c498175e2c1
--- /dev/null
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+#include "aidge/utils/Types.h"
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN MaxPooling descriptor
+    if (mMaxPoolingDesc == nullptr) {
+        const MaxPooling_Op<DIM>& maxPoolingOp = static_cast<const MaxPooling_Op<DIM>&>(op);
+        const std::vector<int> strides(maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().end());
+        const std::vector<int> paddings(DIM, 0);
+        const std::vector<int> window_dims(maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().end());
+
+        CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc));
+        CHECK_CUDNN_STATUS(
+            cudnnSetPoolingNdDescriptor(mMaxPoolingDesc,
+                                        mMode,
+                                        CUDNN_NOT_PROPAGATE_NAN,
+                                        DIM,
+                                        &window_dims[0],
+                                        &paddings[0],
+                                        &strides[0]));
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingForward(
+            CudaContext::cudnnHandle(),
+            mMaxPoolingDesc,
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+            input.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+        )
+    );
+}
+
+template <Aidge::DimIdx_t DIM>
+Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() {
+    if(mMaxPoolingDesc != nullptr)
+        cudnnDestroyPoolingDescriptor(mMaxPoolingDesc);
+}
+
+
+// Template declarations
+template class Aidge::MaxPoolingImpl_cuda<2>;
diff --git a/src/operator/ProducerImpl.cpp b/src/operator/ProducerImpl.cpp
deleted file mode 100644
index aca3c4945e357be13017e302cb6e7f12ba61237c..0000000000000000000000000000000000000000
--- a/src/operator/ProducerImpl.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#include <cassert>
-#include <numeric> // std::accumulate
-#include <vector>
-
-#include "aidge/data/Tensor.hpp"
-#include "aidge/operator/Producer.hpp"
-#include "aidge/utils/Types.h"
-
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
-
-Aidge::DimSize_t Aidge::ProducerImpl_cuda::getNbProducedData(
-    Aidge::IOIndex_t outputIdx) const
-{
-    // Requires the whole tensors, regardless of available data on inputs
-    assert(outputIdx == 0 && "operator has only one output");
-    (void) outputIdx;
-
-    return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
-}
-
-void Aidge::ProducerImpl_cuda::forward()
-{
-}
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ebd6b276e02000bdefb22fe8d2373255a1a5c2c
--- /dev/null
+++ b/src/operator/ReLUImpl.cpp
@@ -0,0 +1,78 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ReLU.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ReLUImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN ReLU descriptor
+    if (mReLUDesc == nullptr) {
+        #if CUDNN_VERSION >= 5000
+            CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
+            CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+                mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+        #else
+            mReLUDesc = CUDNN_ACTIVATION_RELU;
+        #endif
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnActivationForward(CudaContext::cudnnHandle(),
+                               mReLUDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                               input.getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+}
+
+Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() {
+    if (mReLUDesc != nullptr) {
+        #if CUDNN_VERSION >= 5000
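+            // Below cuDNN 5.0, mReLUDesc holds the activation mode enum set in forward(),
+            // not a descriptor object, so there is nothing to destroy in that case.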
+            cudnnDestroyActivationDescriptor(mReLUDesc);
+        #endif
+    }
+}
+
diff --git a/src/utils/CudaUtils.cpp b/src/utils/CudaUtils.cpp
index a6e0514f9a949a805561d966e2a712701c18936c..ca3263a282322e70157b7537c502a63a3edb526f 100644
--- a/src/utils/CudaUtils.cpp
+++ b/src/utils/CudaUtils.cpp
@@ -40,7 +40,7 @@ void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devi
                 CHECK_CUDA_STATUS(cudaSetDevice(devices[j]));
                 const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0);
                 if (status == cudaErrorPeerAccessAlreadyEnabled) {
-                    printf("Peer access already enabled between device %d and device %d\n", devices[j], devices[i]);
+                    fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]);
                 } else {
                     CHECK_CUDA_STATUS(status);
                 }
diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4d39db555e9d12c7e5135d1eb3db6ffc8f459c3
--- /dev/null
+++ b/unit_tests/Test_AvgPoolingImpl.cpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+#include <cuda_fp16.h>
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/half.hpp"
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+        {
+            {
+                {{  0,   1,   2,   3,   4},
+                 {  5,   6,   7,   8,   9},
+                 { 10,  11,  12,  13,  14},
+                 { 15,  16,  17,  18,  19},
+                 { 20,  21,  22,  23,  24}},
+
+                {{ 25,  26,  27,  28,  29},
+                 { 30,  31,  32,  33,  34},
+                 { 35,  36,  37,  38,  39},
+                 { 40,  41,  42,  43,  44},
+                 { 45,  46,  47,  48,  49}}
+            },
+            {
+                {{100, 101, 102, 103, 104},
+                 {105, 106, 107, 108, 109},
+                 {110, 111, 112, 113, 114},
+                 {115, 116, 117, 118, 119},
+                 {120, 121, 122, 123, 124}},
+
+                {{125, 126, 127, 128, 129},
+                 {130, 131, 132, 133, 134},
+                 {135, 136, 137, 138, 139},
+                 {140, 141, 142, 143, 144},
+                 {145, 146, 147, 148, 149}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {{  3,   5},
+                     { 13,  15}},
+                    {{ 28,  30},
+                     { 38,  40}}
+                },
+                {
+                    {{103, 105},
+                     {113, 115}},
+                    {{128, 130},
+                     {138, 140}}
+                }
+            }
+        });
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Stride >= feature dim") {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
+            {
+                {
+                    {{0.3745, 0.9507, 0.7320},
+                     {0.5987, 0.1560, 0.1560},
+                     {0.0581, 0.8662, 0.6011}}
+                }
+            }
+        });
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
+            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
+        });
+        op->associateInput(0,myInput2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("half") {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
+            {
+                {
+                    {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
+                     {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
+                     {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
+                }
+            }
+        });
+        myInput2->setBackend("cuda");
+
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
+            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
+        });
+        op->associateInput(0,myInput2);
+        op->setDataType(DataType::Float16);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        half_float::half* computedOutput = new half_float::half[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_CastMove.cpp b/unit_tests/Test_CastMove.cpp
index 0b68a4f9dcb6c72df91506a6f92be8c31e95f068..c96600f79967c69e43b3c334d3624f6514b6f936 100644
--- a/unit_tests/Test_CastMove.cpp
+++ b/unit_tests/Test_CastMove.cpp
@@ -18,8 +18,8 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
-#include "aidge/scheduler/Scheduler.hpp"
-#include "aidge/recipies/Recipies.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/recipes/Recipes.hpp"
 
 #include "aidge/backend/cuda.hpp"
diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..54e37db15ded5546eb8fc3caacff9bae238b452c
--- /dev/null
+++ b/unit_tests/Test_FCImpl.cpp
@@ -0,0 +1,133 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
+        {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
+          5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
+          9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+          13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
+          5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
+          9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+          13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
+          5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
+          9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+          13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
+          5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
+          9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+          13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
+          5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
+          9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+          13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
+        {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
+    myWeights->setBackend("cuda");
+    myBias->setBackend("cuda");
+    std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
+    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    op -> associateInput(1, myWeights);
+    op -> associateInput(2, myBias);
+    SECTION("2D input") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
+            {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+              19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+              38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+              57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
+             {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+              90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+              105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+              120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+              135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
+        myInput->setBackend("cuda");
+        op->associateInput(0, myInput);
+        op -> setDataType(DataType::Float32);
+        op -> setBackend("cuda");
+        op->computeOutputDims();
+        myFC->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+    SECTION("4D input") {
+        std::shared_ptr<Tensor> myInput =
+            std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
+                                                                   {5, 6, 7, 8, 9},
+                                                                   {10, 11, 12, 13, 14},
+                                                                   {15, 16, 17, 18, 19},
+                                                                   {20, 21, 22, 23, 24}},
+                                                                  {{25, 26, 27, 28, 29},
+                                                                   {30, 31, 32, 33, 34},
+                                                                   {35, 36, 37, 38, 39},
+                                                                   {40, 41, 42, 43, 44},
+                                                                   {45, 46, 47, 48, 49}},
+                                                                  {{50, 51, 52, 53, 54},
+                                                                   {55, 56, 57, 58, 59},
+                                                                   {60, 61, 62, 63, 64},
+                                                                   {65, 66, 67, 68, 69},
+                                                                   {70, 71, 72, 73, 74}}},
+                                                                 {{{75, 76, 77, 78, 79},
+                                                                   {80, 81, 82, 83, 84},
+                                                                   {85, 86, 87, 88, 89},
+                                                                   {90, 91, 92, 93, 94},
+                                                                   {95, 96, 97, 98, 99}},
+                                                                  {{100, 101, 102, 103, 104},
+                                                                   {105, 106, 107, 108, 109},
+                                                                   {110, 111, 112, 113, 114},
+                                                                   {115, 116, 117, 118, 119},
+                                                                   {120, 121, 122, 123, 124}},
+                                                                  {{125, 126, 127, 128, 129},
+                                                                   {130, 131, 132, 133, 134},
+                                                                   {135, 136, 137, 138, 139},
+                                                                   {140, 141, 142, 143, 144},
+                                                                   {145, 146, 147, 148, 149}}}}});
+        myInput->setBackend("cuda");
+        op->associateInput(0, myInput);
+        op -> setDataType(DataType::Float32);
+        op -> setBackend("cuda");
+        op->computeOutputDims();
+        myFC->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_MaxPoolingImpl.cpp b/unit_tests/Test_MaxPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2ec0dfe5dc6df072b6be3b20c075190cd3f6fce
--- /dev/null
+++ b/unit_tests/Test_MaxPoolingImpl.cpp
@@ -0,0 +1,93 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") {
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+        {
+            {
+                {{-0.3848,  0.2166, -0.4373,  0.6142,  0.5277},
+                 { 0.7995,  0.3638, -1.4589, -1.0843,  1.0918},
+                 { 0.7147,  0.0936, -1.2902,  1.2037,  0.4874},
+                 {-0.5981,  2.1184, -0.9175,  1.3859,  0.3305},
+                 {-1.7700,  0.0563, -0.3914,  0.0538, -0.3955}},
+
+                {{-3.1409, -0.4554,  0.0524,  2.2291,  0.4859},
+                 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
+                 { 2.2329, -0.5850,  0.0700,  1.2838, -1.7363},
+                 { 0.2139,  0.0624, -1.0689, -0.8221, -0.8038},
+                 { 0.1886, -0.7840, -0.2313,  0.2651, -1.6244}}
+            },
+            {
+                {{ 0.4371,  1.6417,  0.9129,  0.6325,  0.5438},
+                 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
+                 { 1.7653, -1.6668, -1.0814,  0.6182,  1.2071},
+                 { 0.9541, -0.5133,  0.8664, -0.8892,  1.4585},
+                 { 1.0220, -0.5107,  0.1829, -0.2301, -0.4268}},
+
+                {{ 1.0429,  0.6279, -0.2875,  0.7187, -0.1500},
+                 { 1.6041,  2.9635,  1.4172, -0.7517,  0.5441},
+                 {-0.2276,  0.0857,  0.6776, -0.1389, -0.0614},
+                 {-0.1547, -0.3435,  0.0650, -0.5095, -1.8073},
+                 { 1.7217,  0.3999, -0.5953,  1.0604, -0.4126}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {{ 0.7995,  0.6142},
+                     { 2.1184,  1.3859}},
+                    {{-0.4554,  2.2291},
+                     { 2.2329,  1.2838}}
+                },
+                {
+                    {{ 1.6417,  0.9129},
+                     { 1.7653,  0.8664}},
+                    {{ 2.9635,  1.4172},
+                     { 0.0857,  0.6776}}
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cuda");
+        op->computeOutputDims();
+        myMaxPool->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82da6fae6737ee39fc60d771c10dc69fa2dea5f6
--- /dev/null
+++ b/unit_tests/Test_ReLUImpl.cpp
@@ -0,0 +1,200 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("2D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array2D<float,2,10> {
+            {
+                { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
+            {
+                { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("3D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<float,2,2,10> {
+            {
+                {
+                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                },
+                {
+                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,2,2,10> {
+            {
+                {
+                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                },
+                {
+                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("4D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    },
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    }
+                },
+                {
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    },
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    }
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    },
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    }
+                },
+                {
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    },
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        op->forward();
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
diff --git a/version.txt b/version.txt
index 8a9ecc2ea99d607e92feae1656ddbf6fdd82a2c1..341cf11faf9a29504168de4e54beaad182c5adc5 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.0.1
\ No newline at end of file
+0.2.0
\ No newline at end of file