/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include <cassert>
#include <chrono>  // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread>  // std::this_thread::sleep_for
#include <vector>

#include "aidge/utils/Types.h"
#include "aidge/operator/Conv.hpp"

#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"

template <Aidge::DimIdx_t DIM>
void Aidge::ConvImpl_cuda<DIM>::forward() {
    // Runs the convolution forward pass with cuDNN. The cuDNN descriptors and
    // the workspace are created lazily on the first call and reused afterwards.
    // FIXME: uncomment the following code once memory handling will work
    assert(mOp.getRawInput(0) && "missing input #0");
    assert(mOp.getRawInput(1) && "missing input #1");

    const auto input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
    const auto output0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));

    // Lazy-initialize CuDNN convolution descriptor
    if (mConvDesc == nullptr) {
        const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp);
        // cuDNN expects int arrays, while the Aidge attributes use DimSize_t.
        const std::vector<int> strides(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end());
        // Zero padding: padding is presumably handled upstream of this
        // operator (Conv_Op exposes no padding attribute here).
        const std::vector<int> paddings(DIM, 0);
        const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());

        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
        CHECK_CUDNN_STATUS(
            cudnnSetConvolutionNdDescriptor(mConvDesc,
                                            DIM,
                                            paddings.data(),
                                            strides.data(),
                                            upscales.data(),
                                            CUDNN_CROSS_CORRELATION,
                                            DataTypeToCudnn(output0->dataType())));
    }

    // Lazy-initialize CuDNN filter descriptor from the weight tensor (input #1)
    if (mFilterDesc == nullptr) {
        const auto weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1));
        const std::vector<int> kernels(weights->dims().begin(), weights->dims().end());

        CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
        CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
                                                      DataTypeToCudnn(weights->dataType()),
                                                      CUDNN_TENSOR_NCHW,
                                                      kernels.size(),
                                                      kernels.data()));
    }

    // Set forward algorithm and allocate the required workspace
    if (mWorkspace == nullptr) {
        const auto inputDesc = dynamic_cast<TensorImpl_cuda_*>(input0->getImpl().get())->getCudnnTensorDesc();
        const auto outputDesc = dynamic_cast<TensorImpl_cuda_*>(output0->getImpl().get())->getCudnnTensorDesc();

        // Find the best CuDNN forward algorithm (the one with the lowest compute time)
        int maxAlgoIterations = 0;
        // BUGFIX: this was the only cuDNN call in the file whose return status
        // was not checked; wrap it like the others.
        CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardAlgorithmMaxCount(
            CudaContext::cudnnHandle(), &maxAlgoIterations));

        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionForwardAlgorithm");

        int returnAlgoCounts = 0;
        std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations);

        CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
                            CudaContext::cudnnHandle(),
                            inputDesc,
                            mFilterDesc,
                            mConvDesc,
                            outputDesc,
                            maxAlgoIterations,
                            &returnAlgoCounts,
                            returnFwdAlgo.data()));
        // Results are sorted by performance: the first entry is the fastest.
        mFwdAlgo = returnFwdAlgo[0].algo;

        // Allocate the workspace required by the chosen CuDNN forward algorithm
        size_t workspaceSize = 0;

        CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize(
            CudaContext::cudnnHandle(),
            inputDesc,
            mFilterDesc,
            mConvDesc,
            outputDesc,
            mFwdAlgo,
            &workspaceSize));

        CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, workspaceSize));
        mWorkspaceSize = workspaceSize;
    }

    // Do the actual forward computation
    // Template is only for scaling parameters, which are always in float
    // excepted when the convolution is performed in double precision.
    if (output0->dataType() == DataType::Float64) {
        forward_<double>();
    }
    else {
        forward_<float>();
    }
}

template <Aidge::DimIdx_t DIM>
template <class T>
void Aidge::ConvImpl_cuda<DIM>::forward_() {
    // Typed forward pass: T selects the precision of the cuDNN scaling
    // factors (double for Float64 data, float otherwise).
    const auto input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
    const auto weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1));
    const auto output0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));

    // out = alpha * conv(in, w) + beta * out
    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
    typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;

    CHECK_CUDNN_STATUS(
        cudnnConvolutionForward(CudaContext::cudnnHandle(),
                                &alpha,
                                dynamic_cast<TensorImpl_cuda_*>(input0->getImpl().get())->getCudnnTensorDesc(),
                                input0->getImpl()->rawPtr(),
                                mFilterDesc,
                                weights->getImpl()->rawPtr(),
                                mConvDesc,
                                mFwdAlgo,
                                mWorkspace,
                                mWorkspaceSize,
                                &beta,
                                dynamic_cast<TensorImpl_cuda_*>(output0->getImpl().get())->getCudnnTensorDesc(),
                                output0->getImpl()->rawPtr()));

    // Add bias (if there is any)
    if (mOp.getRawInput(2) && std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->size() > 0) {
        const auto bias0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2));

        // cudnnAddTensor() requires the bias tensor to have the same number of
        // dimensions as the output: build a (1, C, 1, ..., 1) shape for it.
        std::vector<DimSize_t> biasDims(DIM + 2, 1);
        biasDims[1] = bias0->size();

        // Dummy tensor whose only purpose is to provide a CuDNN tensor
        // descriptor of that shape (through getCudnnTensorDesc()).
        Tensor bias(bias0->dataType());
        bias.setBackend("cuda");
        bias.resize(biasDims);
        // TODO: find a more elegant solution(?)

        CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
                                          &alpha,
                                          dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
                                          bias0->getImpl()->rawPtr(),
                                          &alpha,
                                          dynamic_cast<TensorImpl_cuda_*>(output0->getImpl().get())->getCudnnTensorDesc(),
                                          output0->getImpl()->rawPtr()));
    }
}

template <Aidge::DimIdx_t DIM>
Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
    // Release the lazily-created cuDNN descriptors and the CUDA workspace.
    // Each member is only freed when it was actually created in forward().
    if (mWorkspace != nullptr) {
        cudaFree(mWorkspace);
    }

    if (mFilterDesc != nullptr) {
        cudnnDestroyFilterDescriptor(mFilterDesc);
    }

    if (mConvDesc != nullptr) {
        cudnnDestroyConvolutionDescriptor(mConvDesc);
    }
}


// Explicit template instantiations
template class Aidge::ConvImpl_cuda<2>;