Commit b46d4793 authored by Grégoire Kubler

Merge branch 'dev' of https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cuda into feat/support_ASAN
parents 65127b6e 5c979c8e
2 merge requests: !15 "version 0.2.0", !11 "feat: added ASAN support"
This commit is part of merge request !11.
@@ -14,6 +14,5 @@
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
@@ -34,8 +34,11 @@ private:
     cudnnConvolutionDescriptor_t mConvDesc = nullptr;
     cudnnFilterDescriptor_t mFilterDesc = nullptr;
     cudnnConvolutionFwdAlgo_t mFwdAlgo;
+    cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
+    cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
     size_t mWorkspaceSize = 0;
     void* mFwdWorkspace = nullptr;
+    void* mBwdWorkspace = nullptr;
     std::shared_ptr<Tensor> mInput0Fallback;
     std::shared_ptr<Tensor> mInput1Fallback;
     std::shared_ptr<Tensor> mInput2Fallback;
@@ -49,10 +52,12 @@ public:
 public:
     void forward();
+    void backward();
     ~ConvImpl_cuda();
 private:
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+    template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
 };
 namespace {
...
aidge/backend/cuda/operator/ProducerImpl.hpp (removed by this merge; its include is dropped in the first hunk above):

/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#ifndef AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
#define AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_

#include <memory>

#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"

namespace Aidge {
class ProducerImpl_cuda : public OperatorImpl {
public:
    ProducerImpl_cuda(const Producer_Op &op) : OperatorImpl(op) {}

    static std::unique_ptr<ProducerImpl_cuda> create(const Producer_Op &op) {
        return std::make_unique<ProducerImpl_cuda>(op);
    }

    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
    void forward() override;
};

namespace {
static Registrar<Producer_Op> registrarProducerImpl_cuda("cuda", Aidge::ProducerImpl_cuda::create);
}  // namespace
}  // namespace Aidge

#endif /* AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ */
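The anonymous-namespace registrar in this header is what wired the implementation into Aidge's dispatch: constructing the static Registrar object at load time associates the "cuda" key with ProducerImpl_cuda::create for Producer_Op. A minimal sketch of how such a registration is typically consumed, assuming Registrar exposes a create(key) lookup as elsewhere in Aidge (the helper name attachCudaImpl is illustrative, not part of the codebase):

#include "aidge/operator/Producer.hpp"
#include "aidge/utils/Registrar.hpp"

// Hypothetical helper: fetch the factory registered under "cuda" and build the impl
void attachCudaImpl(Aidge::Producer_Op& op) {
    auto impl = Aidge::Registrar<Aidge::Producer_Op>::create("cuda")(op);
    // The operator would then own `impl` and delegate its forward() calls to it.
}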
@@ -2,8 +2,8 @@
 #define AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H
 #include <vector>
-#include <cstdio>
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 namespace Aidge {
@@ -87,7 +87,7 @@ public:
         if (cublas_h[dev] == NULL) {
             CHECK_CUBLAS_STATUS(cublasCreate(&cublas_h[dev]));
-            printf("CUBLAS initialized on device #%d\n", dev);
+            fmt::print("CUBLAS initialized on device #{}\n", dev);
         }
         return cublas_h[dev];
@@ -113,7 +113,7 @@ public:
         if (cudnn_h[dev] == NULL) {
             CHECK_CUDNN_STATUS(cudnnCreate(&cudnn_h[dev]));
-            printf("CUDNN initialized on device #%d\n", dev);
+            fmt::print("CUDNN initialized on device #{}\n", dev);
         }
         return cudnn_h[dev];
...
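Both hunks above make the same mechanical change: printf with a %d specifier becomes fmt::print with a {} placeholder, the {fmt} library being pulled in through aidge/utils/ErrorHandling.hpp instead of <cstdio>. A minimal self-contained sketch of the pattern (assuming only that {fmt} is on the include path):

#include <fmt/core.h>

int main() {
    const int dev = 0;
    // Before: printf("CUBLAS initialized on device #%d\n", dev);
    // After: the argument type is checked by fmt, with no specifier to keep in sync
    fmt::print("CUBLAS initialized on device #{}\n", dev);
    return 0;
}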
@@ -11,6 +11,8 @@
 #include <cuda.h>
 #include <cudnn.h>
+
+#include "aidge/utils/ErrorHandling.hpp"
 #define CHECK_CUDNN_STATUS(status) \
     do { \
         const cudnnStatus_t e = (status); \
...
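The rest of the CHECK_CUDNN_STATUS body is collapsed in this view. As a minimal sketch, a typical completion of such a macro converts the status to a readable message with cudnnGetErrorString() and aborts; the real Aidge macro presumably reports through ErrorHandling.hpp (exactly what this hunk adds), so treat this as an illustration rather than the actual body:

#include <cstdlib>
#include <cudnn.h>
#include <fmt/core.h>

#define CHECK_CUDNN_STATUS(status)                                    \
    do {                                                              \
        const cudnnStatus_t e = (status);                             \
        if (e != CUDNN_STATUS_SUCCESS) {                              \
            /* Report the failing call site, then abort */            \
            fmt::print(stderr, "CUDNN failure: {} ({}) at {}:{}\n",   \
                       cudnnGetErrorString(e), static_cast<int>(e),   \
                       __FILE__, __LINE__);                           \
            std::abort();                                             \
        }                                                             \
    } while (false)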
@@ -157,6 +157,176 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
     }
 }
 
+template <Aidge::DimIdx_t DIM>
+void Aidge::ConvImpl_cuda<DIM>::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    // FIXME: uncomment the following code once memory handling works
+    assert(mOp.getRawInput(0) && "missing input #0");
+    assert(mOp.getRawInput(1) && "missing input #1");
+
+    // Convert input data (no overhead if not needed!)
+    const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0));
+    const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0));
+
+    // Set the backward algorithms and allocate the required workspace
+    if (mBwdWorkspace == nullptr) {
+        // Find the best cuDNN backward filter algorithm (the one with the lowest compute time)
+        int maxAlgoIterations = 0;
+        cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(),
+                                                           &maxAlgoIterations);
+        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm");
+
+        int returnAlgoCounts = 0;
+        std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations);
+        CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm(
+            CudaContext::cudnnHandle(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            mFilterDesc,
+            maxAlgoIterations,
+            &returnAlgoCounts,
+            &returnBwdFilterAlgo[0]));
+        mBwdFilterAlgo = returnBwdFilterAlgo[0].algo;
+
+        // Find the best cuDNN backward data algorithm
+        maxAlgoIterations = 0;
+        cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(),
+                                                         &maxAlgoIterations);
+        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm");
+
+        returnAlgoCounts = 0;
+        std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations);
+        CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm(
+            CudaContext::cudnnHandle(),
+            mFilterDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            maxAlgoIterations,
+            &returnAlgoCounts,
+            &returnBwdDataAlgo[0]));
+        mBwdDataAlgo = returnBwdDataAlgo[0].algo;
+
+        // Allocate the workspace required by the chosen cuDNN backward algorithms
+        size_t workspaceSize = 0;
+        CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            CudaContext::cudnnHandle(),
+            // same arguments as cudnnFindConvolutionBackwardFilterAlgorithm()
+            // -->
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            mFilterDesc,
+            // <--
+            mBwdFilterAlgo,
+            &workspaceSize));
+
+        size_t workspaceSizeData = 0;
+        CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            CudaContext::cudnnHandle(),
+            // same arguments as cudnnFindConvolutionBackwardDataAlgorithm() -->
+            mFilterDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            // <--
+            mBwdDataAlgo,
+            &workspaceSizeData));
+
+        // A single workspace sized for the larger requirement is shared by both algorithms
+        if (workspaceSizeData > workspaceSize)
+            workspaceSize = workspaceSizeData;
+
+        if (workspaceSize > mWorkspaceSize) {
+            if (mFwdWorkspace != nullptr) {
+                cudaFree(mFwdWorkspace);
+            }
+            CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
+            mWorkspaceSize = workspaceSize;
+        }
+
+        mBwdWorkspace = mFwdWorkspace;
+    }
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the convolution is performed in double precision.
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        backward_<double>(input0, input1, input2);
+    }
+    else {
+        backward_<float>(input0, input1, input2);
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    std::shared_ptr<Tensor> gradOutputFallback;
+    const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad()));
+
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+
+    // Gradient w.r.t. the weights
+    CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter(
+        CudaContext::cudnnHandle(),
+        &alpha,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+        input0.getImpl()->rawPtr(),
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+        gradOutput.getImpl()->rawPtr(),
+        mConvDesc,
+        mBwdFilterAlgo,
+        mBwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        mFilterDesc,
+        op.getInput(1)->grad()->getImpl()->rawPtr()));
+
+    // Gradient w.r.t. the input
+    CHECK_CUDNN_STATUS(cudnnConvolutionBackwardData(
+        CudaContext::cudnnHandle(),
+        &alpha,
+        mFilterDesc,
+        input1.getImpl()->rawPtr(),
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+        gradOutput.getImpl()->rawPtr(),
+        mConvDesc,
+        mBwdDataAlgo,
+        mBwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+        op.getInput(0)->grad()->getImpl()->rawPtr()));
+
+    // Add the bias gradient (if there is any bias)
+    if (mOp.getRawInput(2) && input2.size() > 0) {
+        // The bias tensor needs the same number of dims as the output tensor for cudnnConvolutionBackwardBias()
+        std::vector<DimSize_t> gradBiasDims(DIM+2, 1);
+        gradBiasDims[1] = op.getInput(2)->grad()->size();
+
+        // Create a dummy tensor with the right dims in order to get a cuDNN tensor descriptor (with getCudnnTensorDesc())
+        Tensor gradBias(op.getInput(2)->grad()->dataType());
+        gradBias.setBackend("cuda");
+        gradBias.resize(gradBiasDims);
+        // TODO: find a more elegant solution(?)
+
+        CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(),
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+            gradOutput.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias),
+            op.getInput(2)->grad()->getImpl()->rawPtr()));
+    }
+}
+
 template <Aidge::DimIdx_t DIM>
 Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
     if (mConvDesc != nullptr) {
...
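One detail of backward_() worth spelling out is the bias-gradient descriptor trick: cudnnConvolutionBackwardBias() expects the bias gradient descriptor to have the same number of dimensions as the output gradient, so a C-element bias is described as 1 x C x 1 x 1 (in the 2-D convolution case). The dummy gradBias tensor above exists only to produce that descriptor. A raw-cuDNN sketch of the same idea, outside any Aidge types (makeGradBiasDesc is an illustrative name; CHECK_CUDNN_STATUS is the macro from CudaUtils.hpp):

#include <cudnn.h>

// Build a 1 x C x 1 x 1 descriptor for the gradient of a C-element bias
cudnnTensorDescriptor_t makeGradBiasDesc(int nbChannels) {
    cudnnTensorDescriptor_t desc;
    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&desc));
    CHECK_CUDNN_STATUS(cudnnSetTensor4dDescriptor(
        desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        /*n=*/1, /*c=*/nbChannels, /*h=*/1, /*w=*/1));
    return desc;  // caller releases it with cudnnDestroyTensorDescriptor()
}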
The corresponding ProducerImpl.cpp (also removed):

/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 ********************************************************************************/

#include <cassert>
#include <numeric>  // std::accumulate
#include <vector>

#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/utils/Types.h"

#include "aidge/backend/cuda/operator/ProducerImpl.hpp"

Aidge::DimSize_t Aidge::ProducerImpl_cuda::getNbProducedData(
    Aidge::IOIndex_t outputIdx) const
{
    // Requires the whole tensor, regardless of the amount of data available on the inputs
    assert(outputIdx == 0 && "operator has only one output");
    (void) outputIdx;

    return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
}

void Aidge::ProducerImpl_cuda::forward()
{
}
@@ -40,7 +40,7 @@ void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devices)
         CHECK_CUDA_STATUS(cudaSetDevice(devices[j]));
         const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0);
         if (status == cudaErrorPeerAccessAlreadyEnabled) {
-            printf("Peer access already enabled between device %d and device %d\n", devices[j], devices[i]);
+            fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]);
         } else {
             CHECK_CUDA_STATUS(status);
         }
...
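For context, setMultiDevicePeerAccess() walks the ordered pairs of the given devices and enables peer access, now logging (rather than failing) when access is already on. A hedged usage sketch, based only on the signature visible in this hunk (the include path and the presence of two GPUs are assumptions):

#include "aidge/backend/cuda/utils/CudaUtils.hpp"  // assumed header for Aidge::Cuda

int main() {
    // Enable P2P access between two GPUs (assumes devices 0 and 1 exist)
    unsigned int devices[] = {0, 1};
    Aidge::Cuda::setMultiDevicePeerAccess(2, devices);
    return 0;
}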
@@ -18,8 +18,8 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
-#include "aidge/scheduler/Scheduler.hpp"
-#include "aidge/recipies/Recipies.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/recipes/Recipes.hpp"
 #include "aidge/backend/cuda.hpp"
...
-0.0.1
\ No newline at end of file
+0.2.0
\ No newline at end of file