diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e30fb0010f05825123586e4c4d4c9e56873e854..21f57bc75b2fa4ad8b57711b092693c70b20d628 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,8 +86,8 @@ endif()
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-    -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>)
+    $<$<COMPILE_LANGUAGE:CXX>:$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
+    -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>>)
 target_compile_options(${module_name} PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:
     -Wall>)
diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp
index f1dbc28597660273b346802c48c01de4713ce081..a6bae174471e665f229d08a489d6b9f7911a6e9f 100644
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -13,5 +13,6 @@
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ConvImpl.hpp"
 
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp
index 1b939d70d02615eb04d89890c40a9da3aedbd531..f09cf5d91d60d05762cc86a8fb9ca1400cf1be8b 100644
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -2,7 +2,7 @@
 #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 
 #include <thrust/equal.h>
-#include <thrust/execution_policy.h>
+#include <thrust/device_ptr.h>
 
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
@@ -31,14 +31,7 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
 
     TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
 
-    bool operator==(const TensorImpl &otherImpl) const override final {
-        const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
-
-        if (mTensor.size() != otherImplCuda.mTensor.size())
-            return false;
-
-        return thrust::equal(mData, mData + mTensor.size(), otherImplCuda.mData);
-    }
+    bool operator==(const TensorImpl &otherImpl) const override final;
 
     static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
         return std::make_unique<TensorImpl_cuda<T>>(tensor);
@@ -87,9 +80,6 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
             stride *= mTensor.dims()[dim];
         }
 
-        std::reverse(dims.begin(), dims.end());
-        std::reverse(strides.begin(), strides.end());
-
         CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
                                                       CudaContext::data_type<T>::value,
                                                       dims.size(),
diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index cb1202f4cffd814fac67453065a786c50f1eed18..07c8a7f0d4b8c9663b17a42e216619e8e927ba89 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -46,6 +46,7 @@ private:
 public:
     ConvImpl_cuda(const Conv_Op<DIM> &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) {
         CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
+        CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
     }
 
     static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) {
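Note on the TensorImpl.hpp change: thrust::equal() instantiates device code, so a
definition that applies it to device memory must be compiled by nvcc. That is
presumably why operator==() moves out of the header and into the new
src/data/TensorImpl.cu below, and why <thrust/execution_policy.h> is swapped for
<thrust/device_ptr.h>: wrapping the raw device pointers in thrust::device_ptr lets
thrust dispatch to the GPU without an explicit execution policy. A minimal,
self-contained sketch of that idiom (illustrative names only, not part of the
Aidge API; compile as a .cu file):

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>
    #include <thrust/device_ptr.h>
    #include <thrust/equal.h>

    // Compare two raw device buffers element-wise on the GPU.
    bool deviceBuffersEqual(const float* a, const float* b, std::size_t size) {
        thrust::device_ptr<const float> pa(a);  // tag the pointers as device memory
        thrust::device_ptr<const float> pb(b);
        return thrust::equal(pa, pa + size, pb);
    }

    int main() {
        float host[4] = {0.f, 1.f, 2.f, 3.f};
        float *a, *b;
        cudaMalloc(&a, sizeof(host));
        cudaMalloc(&b, sizeof(host));
        cudaMemcpy(a, host, sizeof(host), cudaMemcpyHostToDevice);
        cudaMemcpy(b, host, sizeof(host), cudaMemcpyHostToDevice);
        assert(deviceBuffersEqual(a, b, 4));
        cudaFree(a);
        cudaFree(b);
        return 0;
    }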
diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4104d27321fb75500fa55dd696bf1c589d69e76e
--- /dev/null
+++ b/src/data/TensorImpl.cu
@@ -0,0 +1,24 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+
+template <class T>
+bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
+    const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
+
+    if (mTensor.size() != otherImplCuda.mTensor.size())
+        return false;
+
+    thrust::device_ptr<T> thrustData(mData);
+    thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData);
+    return thrust::equal(thrustData, thrustData + mTensor.size(), thrustOtherData);
+}
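One caveat with this split: a member function template defined in its own
translation unit only links if it is explicitly instantiated for every T used
elsewhere. The new file above contains no such lines, so this is presumably
handled elsewhere in the repository; if not, instantiations along these lines
would be required (hypothetical, matching whatever types the backend registers):

    // Hypothetical explicit instantiations; without something equivalent,
    // calls to operator==() from other translation units will not link.
    template class Aidge::TensorImpl_cuda<double>;
    template class Aidge::TensorImpl_cuda<float>;
    template class Aidge::TensorImpl_cuda<int>;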
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index fdfdc626cbbb86d03a2e24a0b2dc42134be8904c..7d46072725566d901dadd60b92a42e9b9491c3aa 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -78,9 +78,9 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
     assert(mOp.getInput(0) && "missing input #0");
     assert(mOp.getInput(1) && "missing input #1");
 
-    const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().rbegin(), mOp.template get<ConvParam::StrideDims>().rend());
+    const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().begin(), mOp.template get<ConvParam::StrideDims>().end());
     const std::vector<int> paddings(DIM, 0);
-    const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().rbegin(), mOp.template get<ConvParam::DilationDims>().rend());
+    const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().begin(), mOp.template get<ConvParam::DilationDims>().end());
 
     CHECK_CUDNN_STATUS(
         cudnnSetConvolutionNdDescriptor(mConvDesc,
@@ -89,17 +89,15 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
                                         &strides[0],
                                         &upscales[0],
                                         CUDNN_CROSS_CORRELATION,
-                                        DataTypeToCudnn(mOp.getInput(2)->dataType())));
+                                        DataTypeToCudnn(mOp.getOutput(0)->dataType())));
 
-    const std::vector<int> cudaKernelDims(mOp.getInput(1)->dims().rbegin(),
-                                          mOp.getInput(1)->dims().rend());
+    const std::vector<int> kernels(mOp.getInput(1)->dims().begin(), mOp.getInput(1)->dims().end());
 
-    CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
     CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
                                                   DataTypeToCudnn(mOp.getInput(1)->dataType()),
                                                   CUDNN_TENSOR_NCHW,
-                                                  cudaKernelDims.size(),
-                                                  &cudaKernelDims[0]));
+                                                  kernels.size(),
+                                                  &kernels[0]));
 
     int maxAlgoIterations = 0;
     cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
@@ -221,10 +219,20 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
                                        dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
                                        mOp.getOutput(0)->getImpl()->rawPtr()));
 
-    if (mOp.getInput(2) != nullptr) {
+    if (mOp.getInput(2) && mOp.getInput(2)->size() > 0) {
+        // Bias tensor needs to have the same number of dims as the output tensor for cudnnAddTensor()
+        std::vector<DimSize_t> biasDims(DIM+2, 1);
+        biasDims[1] = mOp.getInput(2)->size();
+
+        // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
+        Tensor bias(mOp.getInput(2)->dataType());
+        bias.setBackend("cuda");
+        bias.resize(biasDims);
+        // TODO: find a more elegant solution
+
         CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
                                           &alpha,
-                                          dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(2)->getImpl().get())->getCudnnTensorDesc(),
+                                          dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
                                           mOp.getInput(2)->getImpl()->rawPtr(),
                                           &alpha,
                                           dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
@@ -234,7 +242,12 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
 }
 
 template <Aidge::DimIdx_t DIM>
 Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
+    cudnnDestroyConvolutionDescriptor(mConvDesc);
+    cudnnDestroyFilterDescriptor(mFilterDesc);
+
+    if (mWorkspace != nullptr) {
+        cudaFree(mWorkspace);
+    }
 }
 
 template <Aidge::DimIdx_t DIM>
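The dummy bias tensor works because cudnnAddTensor() broadcasts: each dimension of
the bias descriptor must either match the corresponding output dimension or be 1,
and size-1 dimensions are stretched. Reshaping the C-element bias to 1 x C x 1 x 1
therefore adds one value per output channel. A sketch of the equivalent descriptor
in raw cuDNN (helper name is made up, float data type assumed):

    #include <cudnn.h>
    #include <vector>

    // Build a broadcastable 4-D bias descriptor of shape 1 x C x 1 x 1
    // (packed NCHW layout, so the strides are {C, 1, 1, 1}).
    cudnnTensorDescriptor_t makeBiasDesc(int channels) {
        cudnnTensorDescriptor_t desc;
        cudnnCreateTensorDescriptor(&desc);
        const std::vector<int> dims    = {1, channels, 1, 1};
        const std::vector<int> strides = {channels, 1, 1, 1};
        cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, 4,
                                   dims.data(), strides.data());
        return desc;
    }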
diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp
index 2746f82ffa9840eec7ceec15a1977d0e272a9bde..659528dd1b2a45fcdd67ca0bd3440391a0e79654 100644
--- a/unit_tests/Test_ConvImpl.cpp
+++ b/unit_tests/Test_ConvImpl.cpp
@@ -23,11 +23,61 @@ using namespace Aidge;
 
 TEST_CASE("[gpu/operator] Conv(forward)") {
+    SECTION("Simple Conv no bias") {
+        std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv");
+        myConv->getOperator()->setDatatype(DataType::Float32);
+        myConv->getOperator()->setBackend("cuda");
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {{ 0, 1, 2},
+                     { 3, 4, 5},
+                     { 6, 7, 8}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
+            {
+                {
+                    {{ 0, 1, 2},
+                     { 3, 4, 5},
+                     { 6, 7, 8}}
+                }
+            }
+        });
+        const float myOutput = 0*0+1*1+2*2+3*3+4*4+5*5+6*6+7*7+8*8;
+
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+
+        myConv->getOperator()->associateInput(0,myInput);
+        myConv->getOperator()->associateInput(1,myWeights);
+        myConv->getOperator()->computeOutputDims();
+        myConv->forward();
+
+        REQUIRE(myConv->getOperator()->getOutput(0)->size() == 1);
+
+        std::array<float, 9> kernel;
+        cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
+        std::array<float, 9> input;
+        cudaMemcpy(&input[0], myInput->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
+
+        for (int i = 0; i < 9; ++i) {
+            REQUIRE(kernel[i] == i);
+            REQUIRE(input[i] == i);
+        }
+
+        float computedOutput;
+        cudaMemcpy(&computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost);
+
+        REQUIRE(fabs(computedOutput - myOutput) < 1e-6);
+    }
+
     SECTION("Classic Conv") {
         std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
         myConv->getOperator()->setDatatype(DataType::Float32);
         myConv->getOperator()->setBackend("cuda");
-        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> {
             {
                 {
                     {{ 0, 1, 2},
@@ -75,8 +125,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
             {
                 {
                     {{ 0, 1, 2, 3, 4},
@@ -118,7 +168,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> {
             {
                 {
                     {{ 15226, 15577, 15928},
@@ -150,12 +200,26 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
+
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+
         myConv->getOperator()->associateInput(0,myInput);
         myConv->getOperator()->associateInput(1,myWeights);
         myConv->getOperator()->associateInput(2,myBias);
        myConv->getOperator()->computeOutputDims();
         myConv->forward();
         // myConv->getOperator()->getOutput(0)->print();
-        REQUIRE(*(myConv->getOperator()->getOutput(0)) == *myOutput);
+
+        float* computedOutput = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(std::size_t i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
     }
 }
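Both test sections now verify results by copying device memory back to the host
instead of relying on Tensor::operator==, which keeps the expected values on the
CPU backend. The pattern they repeat could be factored into a helper like this
sketch (hypothetical name, tolerance matching the tests):

    #include <cmath>
    #include <cstddef>
    #include <cuda_runtime.h>
    #include <vector>

    // Copy a device buffer back to the host and compare it element-wise
    // against expected values, within a small absolute tolerance.
    bool outputMatches(const float* devicePtr, const std::vector<float>& expected,
                       float tolerance = 1e-6f) {
        std::vector<float> host(expected.size());
        cudaMemcpy(host.data(), devicePtr, expected.size() * sizeof(float),
                   cudaMemcpyDeviceToHost);
        for (std::size_t i = 0; i < expected.size(); ++i) {
            if (std::fabs(host[i] - expected[i]) > tolerance)
                return false;
        }
        return true;
    }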