Compare revisions
Commits on Source (4)
@@ -14,6 +14,5 @@
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
@@ -34,8 +34,11 @@ private:
cudnnConvolutionDescriptor_t mConvDesc = nullptr;
cudnnFilterDescriptor_t mFilterDesc = nullptr;
cudnnConvolutionFwdAlgo_t mFwdAlgo;
+cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
+cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
size_t mWorkspaceSize = 0;
void* mFwdWorkspace = nullptr;
+void* mBwdWorkspace = nullptr;
std::shared_ptr<Tensor> mInput0Fallback;
std::shared_ptr<Tensor> mInput1Fallback;
std::shared_ptr<Tensor> mInput2Fallback;
@@ -49,10 +52,12 @@ public:
public:
void forward();
+void backward();
~ConvImpl_cuda();
private:
template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
};
namespace {
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
#define AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
#include <memory>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
class ProducerImpl_cuda : public OperatorImpl {
public:
ProducerImpl_cuda(const Producer_Op &op) : OperatorImpl(op) {}
static std::unique_ptr<ProducerImpl_cuda> create(const Producer_Op &op) {
return std::make_unique<ProducerImpl_cuda>(op);
}
NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
void forward() override;
};
namespace {
static Registrar<Producer_Op> registrarProducerImpl_cuda("cuda", Aidge::ProducerImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ */
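For reference, the Registrar declaration above is what binds this implementation to the "cuda" backend key. A minimal sketch of how such a registration is typically consumed (the Registrar<Producer_Op>::create() lookup is an assumption based on the declarations above, not part of this diff):

// Hypothetical lookup: fetch the factory registered under "cuda"
// and build the implementation for an existing Producer_Op `op`.
auto factory = Aidge::Registrar<Aidge::Producer_Op>::create("cuda");
auto impl = factory(op);  // yields a ProducerImpl_cuda bound to `op`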
@@ -2,8 +2,8 @@
#define AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H
#include <vector>
-#include <cstdio>
+#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
@@ -87,7 +87,7 @@ public:
if (cublas_h[dev] == NULL) {
CHECK_CUBLAS_STATUS(cublasCreate(&cublas_h[dev]));
printf("CUBLAS initialized on device #%d\n", dev);
fmt::print("CUBLAS initialized on device #{}\n", dev);
}
return cublas_h[dev];
@@ -113,7 +113,7 @@ public:
if (cudnn_h[dev] == NULL) {
CHECK_CUDNN_STATUS(cudnnCreate(&cudnn_h[dev]));
printf("CUDNN initialized on device #%d\n", dev);
fmt::print("CUDNN initialized on device #{}\n", dev);
}
return cudnn_h[dev];
......
@@ -11,6 +11,8 @@
#include <cuda.h>
#include <cudnn.h>
#include "aidge/utils/ErrorHandling.hpp"
#define CHECK_CUDNN_STATUS(status) \
do { \
const cudnnStatus_t e = (status); \
......
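The macro body is cut off above. A plausible completion, consistent with the newly added ErrorHandling.hpp include (the use of AIDGE_THROW_OR_ABORT() and the exact message are assumptions, not the committed code):

#define CHECK_CUDNN_STATUS(status)                                    \
    do {                                                              \
        const cudnnStatus_t e = (status);                             \
        if (e != CUDNN_STATUS_SUCCESS) {                              \
            /* cudnnGetErrorString() maps the code to readable text */\
            AIDGE_THROW_OR_ABORT(std::runtime_error,                  \
                "CUDNN failure: {}", cudnnGetErrorString(e));         \
        }                                                             \
    } while (false)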
@@ -157,6 +157,176 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp
}
}
template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::backward() {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
// FIXME: uncomment the following code once memory handling works
assert(mOp.getRawInput(0) && "missing input #0");
assert(mOp.getRawInput(1) && "missing input #1");
// Convert input data (no overhead if not needed!)
const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0));
const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0));
const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0));
// Set backward algorithm and allocate the required workspace
if (mBwdWorkspace == nullptr) {
// Find the best CuDNN backward algorithm (the one with the lowest compute time)
int maxAlgoIterations = 0;
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(),
&maxAlgoIterations);
assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm");
int returnAlgoCounts = 0;
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations);
CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm(
CudaContext::cudnnHandle(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
mFilterDesc,
maxAlgoIterations,
&returnAlgoCounts,
&returnBwdFilterAlgo[0]));
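// cudnnFind*Algorithm() results are sorted by compute time, so entry 0 is the fastest algorithm that succeeded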
mBwdFilterAlgo = returnBwdFilterAlgo[0].algo;
maxAlgoIterations = 0;
cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(),
&maxAlgoIterations);
assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm");
returnAlgoCounts = 0;
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations);
CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm(
CudaContext::cudnnHandle(),
mFilterDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
maxAlgoIterations,
&returnAlgoCounts,
&returnBwdDataAlgo[0]));
mBwdDataAlgo = returnBwdDataAlgo[0].algo;
// Allocate the workspace required by the chosen CuDNN backward algorithm
size_t workspaceSize = 0;
CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize(
CudaContext::cudnnHandle(),
// same arguments as cudnnFindConvolutionBackwardFilterAlgorithm()
// -->
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
mFilterDesc,
// <--
mBwdFilterAlgo,
&workspaceSize));
size_t workspaceSizeData = 0;
CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardDataWorkspaceSize(
CudaContext::cudnnHandle(),
// same arguments as cudnnFindConvolutionBackwardDataAlgorithm() -->
mFilterDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
mConvDesc,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
// <--
mBwdDataAlgo,
&workspaceSizeData));
if (workspaceSizeData > workspaceSize)
workspaceSize = workspaceSizeData;
if (workspaceSize > mWorkspaceSize) {
if (mFwdWorkspace != nullptr) {
cudaFree(mFwdWorkspace);
}
CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
mWorkspaceSize = workspaceSize;
}
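// Forward and backward share a single workspace buffer, sized for the larger of the two requirements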
mBwdWorkspace = mFwdWorkspace;
}
// Do the actual backward computation
// Template is only for scaling parameters, which are always in float
// except when the convolution is performed in double precision.
if (op.getOutput(0)->dataType() == DataType::Float64) {
backward_<double>(input0, input1, input2);
}
else {
backward_<float>(input0, input1, input2);
}
}
template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
std::shared_ptr<Tensor> gradOutputFallback;
const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad()));
const T alpha = 1.0f;
const T beta = 0.0f;
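// Weight gradient: correlates input #0 with the output gradient; beta == 0 overwrites input #1's grad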
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter(
CudaContext::cudnnHandle(),
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
input0.getImpl()->rawPtr(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
mConvDesc,
mBwdFilterAlgo,
mBwdWorkspace,
mWorkspaceSize,
&beta,
mFilterDesc,
op.getInput(1)->grad()->getImpl()->rawPtr()));
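// Input gradient: convolves the filter (input #1) with the output gradient into input #0's grad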
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardData(
CudaContext::cudnnHandle(),
&alpha,
mFilterDesc,
input1.getImpl()->rawPtr(),
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
mConvDesc,
mBwdDataAlgo,
mBwdWorkspace,
mWorkspaceSize,
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
op.getInput(0)->grad()->getImpl()->rawPtr()));
// Add bias (if there is any)
if (mOp.getRawInput(2) && input2.size() > 0) {
// Bias tensor needs to have the same number of dims as the output tensor for cudnnConvolutionBackwardBias()
std::vector<DimSize_t> gradBiasDims(DIM+2, 1);
gradBiasDims[1] = op.getInput(2)->grad()->size();
// Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
Tensor gradBias(op.getInput(2)->grad()->dataType());
gradBias.setBackend("cuda");
gradBias.resize(gradBiasDims);
// TODO: find a more elegant solution(?)
CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(),
&alpha,
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
gradOutput.getImpl()->rawPtr(),
&beta,
std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias),
op.getInput(2)->grad()->getImpl()->rawPtr()));
}
}
template <Aidge::DimIdx_t DIM>
Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
if (mConvDesc != nullptr) {
......
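The destructor body is truncated here. A minimal sketch of the cleanup it presumably performs, using the matching cuDNN/CUDA release calls (the exact body is an assumption, not the committed code):

template <Aidge::DimIdx_t DIM>
Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
    if (mConvDesc != nullptr) {
        cudnnDestroyConvolutionDescriptor(mConvDesc);
    }
    if (mFilterDesc != nullptr) {
        cudnnDestroyFilterDescriptor(mFilterDesc);
    }
    if (mFwdWorkspace != nullptr) {
        cudaFree(mFwdWorkspace);  // mBwdWorkspace aliases this buffer, so free it only once
    }
}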
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
Aidge::DimSize_t Aidge::ProducerImpl_cuda::getNbProducedData(
Aidge::IOIndex_t outputIdx) const
{
// Requires the whole tensors, regardless of available data on inputs
assert(outputIdx == 0 && "operator has only one output");
(void) outputIdx;
return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
}
void Aidge::ProducerImpl_cuda::forward()
{
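    // Nothing to compute: a Producer's output tensor already holds its data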
}
@@ -40,7 +40,7 @@ void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devi
CHECK_CUDA_STATUS(cudaSetDevice(devices[j]));
const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0);
if (status == cudaErrorPeerAccessAlreadyEnabled) {
printf("Peer access already enabled between device %d and device %d\n", devices[j], devices[i]);
fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]);
} else {
CHECK_CUDA_STATUS(status);
}
......
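For context, a minimal usage sketch of the function above (device IDs are illustrative, not from this diff):

#include "aidge/backend/cuda/utils/CudaUtils.hpp"

unsigned int devices[2] = {0, 1};  // illustrative device IDs
// Enables peer access between every pair of listed devices
Aidge::Cuda::setMultiDevicePeerAccess(2, devices);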
@@ -18,8 +18,8 @@
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/recipies/Recipies.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/recipes/Recipes.hpp"
#include "aidge/backend/cuda.hpp"
......
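A hypothetical test skeleton matching the updated includes (the model, names, and dimensions are placeholders, not taken from this diff):

// Build a tiny graph, move it to the CUDA backend, and run it with the
// scheduler class the test now includes.
auto conv = Aidge::Conv(3, 4, {3, 3}, "conv1");  // placeholder layer
auto graph = Aidge::Sequential({conv});
graph->setBackend("cuda");
graph->setDataType(Aidge::DataType::Float32);
Aidge::SequentialScheduler scheduler(graph);
scheduler.forward();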
-0.0.1
\ No newline at end of file
+0.2.0
\ No newline at end of file