/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/utils/Types.h"
#include "aidge/operator/Conv.hpp"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
template <Aidge::DimIdx_t DIM>
Aidge::NbElts_t Aidge::ConvImpl_cuda<DIM>::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const {
assert(mOp.getInput(inputIdx) && "requires valid input");
// Requires the whole input tensor
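// e.g. for an NCHW input with dims {16, 3, 224, 224}, this returns
// 16 * 3 * 224 * 224 = 2408448 elements.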
const auto &inputDims = std::static_pointer_cast<Tensor>(mOp.getInput(inputIdx))->dims();
return std::accumulate(inputDims.begin(), inputDims.end(), Aidge::NbElts_t(1), std::multiplies<NbElts_t>());
}
template <Aidge::DimIdx_t DIM>
Aidge::NbElts_t Aidge::ConvImpl_cuda<DIM>::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
// For the direct convolution algorithm, the computation can be done in-place
// if there is no padding!
return 0;
}
template <Aidge::DimIdx_t DIM>
Aidge::NbElts_t Aidge::ConvImpl_cuda<DIM>::getRequiredMemory(const Aidge::IOIndex_t outputIdx,
const std::vector<Aidge::DimSize_t> &/*inputsSize*/) const {
// Requires the whole output tensor, regardless of the amount of data available on the inputs
assert(outputIdx == 0 && "operator has only one output");
(void) outputIdx;
const auto &outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
}
template <Aidge::DimIdx_t DIM>
Aidge::NbElts_t Aidge::ConvImpl_cuda<DIM>::getNbConsumedData(Aidge::IOIndex_t inputIdx) const {
assert(static_cast<std::size_t>(inputIdx) < mNbConsumedData.size());
return mNbConsumedData[static_cast<std::size_t>(inputIdx)];
}
template <Aidge::DimIdx_t DIM>
Aidge::NbElts_t Aidge::ConvImpl_cuda<DIM>::getNbProducedData(Aidge::IOIndex_t outputIdx) const {
assert((outputIdx == 0) && (static_cast<std::size_t>(outputIdx) < mNbProducedData.size()));
return mNbProducedData[static_cast<std::size_t>(outputIdx)];
}
template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::updateConsummerProducer(){
// Update producer-consumer data
// Each input is consumed by the minimum amount required for a forward pass
for (std::size_t inputIdx = 0; inputIdx < mNbConsumedData.size(); ++inputIdx)
    mNbConsumedData[inputIdx] += getNbRequiredData(static_cast<IOIndex_t>(inputIdx));
mNbProducedData[0] += getRequiredMemory(0, {});
}
template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::forward() {
// FIXME: uncomment the following code once memory handling works
assert(mOp.getInput(0) && "missing input #0");
assert(mOp.getInput(1) && "missing input #1");
// Lazy-initialize the cuDNN convolution descriptor
if (mConvDesc == nullptr) {
const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().begin(), mOp.template get<ConvParam::StrideDims>().end());
const std::vector<int> paddings(DIM, 0);
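// Note: the paddings are set to zero and are not read from the operator's
// parameters; in Aidge, padding is expected to be handled by a separate Pad
// operator upstream of the convolution.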
const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().begin(), mOp.template get<ConvParam::DilationDims>().end());
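// CUDNN_CROSS_CORRELATION (no kernel flipping) matches what deep-learning
// frameworks usually call a "convolution" layer.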
CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
CHECK_CUDNN_STATUS(
cudnnSetConvolutionNdDescriptor(mConvDesc,
DIM,
&paddings[0],
&strides[0],
&upscales[0],
CUDNN_CROSS_CORRELATION,
DataTypeToCudnn(mOp.getOutput(0)->dataType())));
}
// Lazy-initialize the cuDNN filter descriptor
if (mFilterDesc == nullptr) {
const std::vector<int> kernels(mOp.getInput(1)->dims().begin(), mOp.getInput(1)->dims().end());
CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
DataTypeToCudnn(mOp.getInput(1)->dataType()),
CUDNN_TENSOR_NCHW,
kernels.size(),
&kernels[0]));
}
// Set forward algorithm and allocate the required workspace
if (mWorkspace == nullptr) {
// Find the best cuDNN forward algorithm (the one with the lowest compute time)
int maxAlgoIterations = 0;
cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
&maxAlgoIterations);
assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionForwardAlgorithm");
int returnAlgoCounts = 0;
std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations);
CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
CudaContext::cudnnHandle(),
dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
mFilterDesc,
mConvDesc,
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
maxAlgoIterations,
&returnAlgoCounts,
&returnFwdAlgo[0]));
mFwdAlgo = returnFwdAlgo[0].algo;
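// cudnnFindConvolutionForwardAlgorithm() returns its results sorted by
// ascending compute time, so the first entry is the fastest algorithm.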
// Allocate the workspace required by the chosen cuDNN forward algorithm
size_t workspaceSize = 0;
CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize(
CudaContext::cudnnHandle(),
dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
mFilterDesc,
mConvDesc,
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
mFwdAlgo,
&workspaceSize));
CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, workspaceSize));
mWorkspaceSize = workspaceSize;
}
// Do the actual forward computation
// The template parameter is only for the scaling parameters, which are always in float,
// except when the convolution is performed in double precision.
if (mOp.getOutput(0)->dataType() == DataType::Float64) {
forward_<double>();
}
else {
forward_<float>();
}
}
template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::ConvImpl_cuda<DIM>::forward_() {
const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
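// cuDNN computes output = alpha * conv(input, filter) + beta * output;
// with alpha = 1 and beta = 0, the result simply overwrites the output buffer.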
CHECK_CUDNN_STATUS(
cudnnConvolutionForward(CudaContext::cudnnHandle(),
&alpha,
dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
mOp.getInput(0)->getImpl()->rawPtr(),
mFilterDesc,
mOp.getInput(1)->getImpl()->rawPtr(),
mConvDesc,
mFwdAlgo,
mWorkspace,
mWorkspaceSize,
&beta,
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
mOp.getOutput(0)->getImpl()->rawPtr()));
// Add bias (if there is any)
if (mOp.getInput(2) && mOp.getInput(2)->size() > 0) {
// The bias tensor needs to have the same number of dims as the output tensor for cudnnAddTensor()
std::vector<DimSize_t> biasDims(DIM+2, 1);
biasDims[1] = mOp.getInput(2)->size();
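// e.g. for DIM = 2 and C output channels, the bias is viewed as a {1, C, 1, 1}
// tensor, so that cudnnAddTensor() broadcasts it over the batch and spatial dimensions.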
// Create a dummy tensor with the right dims in order to get a cuDNN tensor descriptor (with getCudnnTensorDesc())
Tensor bias(mOp.getInput(2)->dataType());
bias.setBackend("cuda");
bias.resize(biasDims);
// TODO: find a more elegant solution(?)
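// cudnnAddTensor() computes C = alpha * A + beta * C; passing alpha (= 1) for
// both scaling factors yields output += bias.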
CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
&alpha,
dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
mOp.getInput(2)->getImpl()->rawPtr(),
&alpha,
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
mOp.getOutput(0)->getImpl()->rawPtr()));
}
}
template <Aidge::DimIdx_t DIM>
Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
if (mConvDesc != nullptr) {
cudnnDestroyConvolutionDescriptor(mConvDesc);
}
if (mFilterDesc != nullptr) {
cudnnDestroyFilterDescriptor(mFilterDesc);
}
if (mWorkspace != nullptr) {
cudaFree(mWorkspace);
}
}
template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::backward() { printf("Not implemented yet.\n"); }
// Explicit template instantiation
template class Aidge::ConvImpl_cuda<2>;