From 60ca181b058b884f0789b54a25663eb6f5aa6e28 Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Wed, 19 Feb 2025 17:58:52 +0100 Subject: [PATCH 01/12] Added missing Tensor types --- .../aidge/backend/cuda/data/TensorImpl.hpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 67f6175..dfd347d 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -282,16 +282,17 @@ private: template <typename T> const std::string TensorImpl_cuda<T>::Backend = "cuda"; -namespace { -static Registrar<Tensor> registrarTensorImpl_cuda_Float64( - {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Float32( - {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Float16( - {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Int32( - {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create); -} // namespace +REGISTRAR(Tensor, {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create); +REGISTRAR(Tensor, {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create); +REGISTRAR(Tensor, {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create); +REGISTRAR(Tensor, {"cuda", DataType::Int64}, Aidge::TensorImpl_cuda<int64_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::Int16}, Aidge::TensorImpl_cuda<int16_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::Int8}, Aidge::TensorImpl_cuda<int8_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::UInt64}, Aidge::TensorImpl_cuda<uint64_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::UInt32}, Aidge::TensorImpl_cuda<uint32_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::UInt16}, Aidge::TensorImpl_cuda<uint16_t>::create); +REGISTRAR(Tensor, {"cuda", DataType::UInt8}, Aidge::TensorImpl_cuda<uint8_t>::create); } // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */ -- GitLab From 58e5cf65fbcd4133bf812a2a804c15cc3b888c51 Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Thu, 20 Feb 2025 09:34:30 +0100 Subject: [PATCH 02/12] Added missing thrust_copy --- src/data/TensorImpl.cu | 162 +++++++++++++++++++++++++++++++++-------- 1 file changed, 132 insertions(+), 30 deletions(-) diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu index c70b024..528caf1 100644 --- a/src/data/TensorImpl.cu +++ b/src/data/TensorImpl.cu @@ -99,33 +99,135 @@ bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const { return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData); } -template void Aidge::thrust_copy<double, double>(double const*, double*, unsigned long); -template void Aidge::thrust_copy<double, float>(double const*, float*, unsigned long); -template void Aidge::thrust_copy<double, int>(double const*, int*, unsigned long); -template void Aidge::thrust_copy<float, double>(float const*, double*, unsigned long); -template void Aidge::thrust_copy<float, float>(float const*, float*, unsigned long); -template void Aidge::thrust_copy<float, int>(float const*, int*, unsigned long); -template void Aidge::thrust_copy<int, double>(int const*, double*, 
unsigned long); -template void Aidge::thrust_copy<int, float>(int const*, float*, unsigned long); -template void Aidge::thrust_copy<int, int>(int const*, int*, unsigned long); -template void Aidge::thrust_copy<long, double>(long const*, double*, unsigned long); -template void Aidge::thrust_copy<long, float>(long const*, float*, unsigned long); -template void Aidge::thrust_copy<long, int>(long const*, int*, unsigned long); -template void Aidge::thrust_copy<short, double>(short const*, double*, unsigned long); -template void Aidge::thrust_copy<short, float>(short const*, float*, unsigned long); -template void Aidge::thrust_copy<short, int>(short const*, int*, unsigned long); -template void Aidge::thrust_copy<signed char, double>(signed char const*, double*, unsigned long); -template void Aidge::thrust_copy<signed char, float>(signed char const*, float*, unsigned long); -template void Aidge::thrust_copy<signed char, int>(signed char const*, int*, unsigned long); -template void Aidge::thrust_copy<unsigned char, double>(unsigned char const*, double*, unsigned long); -template void Aidge::thrust_copy<unsigned char, float>(unsigned char const*, float*, unsigned long); -template void Aidge::thrust_copy<unsigned char, int>(unsigned char const*, int*, unsigned long); -template void Aidge::thrust_copy<unsigned int, double>(unsigned int const*, double*, unsigned long); -template void Aidge::thrust_copy<unsigned int, float>(unsigned int const*, float*, unsigned long); -template void Aidge::thrust_copy<unsigned int, int>(unsigned int const*, int*, unsigned long); -template void Aidge::thrust_copy<unsigned long, double>(unsigned long const*, double*, unsigned long); -template void Aidge::thrust_copy<unsigned long, float>(unsigned long const*, float*, unsigned long); -template void Aidge::thrust_copy<unsigned long, int>(unsigned long const*, int*, unsigned long); -template void Aidge::thrust_copy<unsigned short, double>(unsigned short const*, double*, unsigned long); -template void Aidge::thrust_copy<unsigned short, float>(unsigned short const*, float*, unsigned long); -template void Aidge::thrust_copy<unsigned short, int>(unsigned short const*, int*, unsigned long); +// double +template void Aidge::thrust_copy<>(double const*, double*, size_t); +template void Aidge::thrust_copy<>(double const*, float*, size_t); +template void Aidge::thrust_copy<>(double const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(double const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(double const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(double const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(double const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(double const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(double const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(double const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(double const*, uint8_t*, size_t); +// float +template void Aidge::thrust_copy<>(float const*, double*, size_t); +template void Aidge::thrust_copy<>(float const*, float*, size_t); +template void Aidge::thrust_copy<>(float const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(float const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(float const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(float const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(float const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(float const*, uint64_t*, size_t); 
+template void Aidge::thrust_copy<>(float const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(float const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(float const*, uint8_t*, size_t); +// half_float::half +template void Aidge::thrust_copy<>(half_float::half const*, double*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, float*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(half_float::half const*, uint8_t*, size_t); +// int64_t +template void Aidge::thrust_copy<>(int64_t const*, double*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, float*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(int64_t const*, uint8_t*, size_t); +// int32_t +template void Aidge::thrust_copy<>(int32_t const*, double*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, float*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(int32_t const*, uint8_t*, size_t); +// int16_t +template void Aidge::thrust_copy<>(int16_t const*, double*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, float*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(int16_t const*, uint8_t*, size_t); +// int8_t +template void 
Aidge::thrust_copy<>(int8_t const*, double*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, float*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(int8_t const*, uint8_t*, size_t); +// uint64_t +template void Aidge::thrust_copy<>(uint64_t const*, double*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, float*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(uint64_t const*, uint8_t*, size_t); +// uint32_t +template void Aidge::thrust_copy<>(uint32_t const*, double*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, float*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(uint32_t const*, uint8_t*, size_t); +// uint16_t +template void Aidge::thrust_copy<>(uint16_t const*, double*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, float*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(uint16_t const*, uint8_t*, size_t); +// uint8_t +template void Aidge::thrust_copy<>(uint8_t const*, double*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, float*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, half_float::half*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, int64_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t 
const*, int32_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, int16_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, int8_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, uint64_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, uint32_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, uint16_t*, size_t); +template void Aidge::thrust_copy<>(uint8_t const*, uint8_t*, size_t); \ No newline at end of file -- GitLab From fd020207ce2fe9fd4fd8d1ac5ba33bca7681494a Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 13:29:49 +0000 Subject: [PATCH 03/12] Fix bad reduce implementation and memory leak --- .../backend/cuda/operator/ReduceMeanImpl.hpp | 9 +- .../backend/cuda/operator/ReduceSumImpl.hpp | 8 +- src/operator/ReduceMeanImpl.cpp | 142 +++++++++--------- src/operator/ReduceSumImpl.cpp | 139 +++++++++-------- 4 files changed, 150 insertions(+), 148 deletions(-) diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp index 1f68784..d84f39c 100644 --- a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp @@ -46,10 +46,17 @@ public: void forward() override; void backward() override; + ~ReduceMeanImpl_cuda(); +private: private: // CuDNN specific variables - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + cudnnReduceTensorDescriptor_t mReduceDesc = nullptr; + cudnnTensorDescriptor_t mOutputDesc = nullptr; + size_t mWorkspaceSize = 0; + void* mWorkspace = nullptr; + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp index 10af90b..500670d 100644 --- a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp @@ -46,10 +46,16 @@ public: void forward() override; void backward() override; + ~ReduceSumImpl_cuda(); private: // CuDNN specific variables - std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + cudnnReduceTensorDescriptor_t mReduceDesc = nullptr; + cudnnTensorDescriptor_t mOutputDesc = nullptr; + size_t mWorkspaceSize = 0; + void* mWorkspace = nullptr; + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp index 2746d4c..a3e14c1 100644 --- a/src/operator/ReduceMeanImpl.cpp +++ b/src/operator/ReduceMeanImpl.cpp @@ -36,6 +36,51 @@ void Aidge::ReduceMeanImpl_cuda::forward() { std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->copy(input.getImpl()->rawPtr(), input.size()); } else { + if (!keepDims && mOutputDesc == nullptr) { + std::vector<int> outputDims; + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); + for (const auto axis:axes) { + outputDims[axis] = 1; + } + if (outputDims.size() < 4) { + outputDims.resize(4, 1); + } + // Compute the corresponding strides + 
std::vector<int> outputStrides(outputDims.size()); + int product = 1; + for (size_t i = outputDims.size(); i > 0; --i) { + outputStrides[i - 1] = product; + product *= outputDims[i - 1]; + } + + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mOutputDesc)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mOutputDesc, DataTypeToCudnn(op.getOutput(0)->dataType()), outputDims.size(), outputDims.data(), outputStrides.data())); + } + + if (mReduceDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&mReduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(mReduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + DataTypeToCudnn(op.getOutput(0)->dataType()), + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + } + + if (mWorkspace == nullptr) { + const auto outputDesc = (keepDims) + ? std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)) + : mOutputDesc; + + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + mReduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + mOutputDesc, + &mWorkspaceSize)); + + CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, mWorkspaceSize)); + } + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: forward_<double>(input, axes, keepDims); @@ -59,97 +104,32 @@ void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - cudnnReduceTensorDescriptor_t reduceDesc; - cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mWorkspace, + mWorkspaceSize, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), input.getImpl()->rawPtr(), &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - - CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - } + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); } else { - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); - std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { - outputDims[axis] = 1; - } - if (outputDims.size() < 4) { - outputDims.resize(4, 1); - } - // Compute the corresponding strides - std::vector<int> outputStrides(outputDims.size()); - int product = 1; - for 
(size_t i = outputDims.size(); i > 0; --i) { - outputStrides[i - 1] = product; - product *= outputDims[i - 1]; - } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mWorkspace, + mWorkspaceSize, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), input.getImpl()->rawPtr(), &beta, - outputDesc, + mOutputDesc, std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - - CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); } } @@ -206,3 +186,17 @@ void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vec alpha, beta); } + +Aidge::ReduceMeanImpl_cuda::~ReduceMeanImpl_cuda() { + if (mReduceDesc != nullptr) { + cudnnDestroyReduceTensorDescriptor(mReduceDesc); + } + + if (mOutputDesc != nullptr) { + cudnnDestroyTensorDescriptor(mOutputDesc); + } + + if (mWorkspace != nullptr) { + cudaFree(mWorkspace); + } +} diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp index e8c5b1e..46469c5 100644 --- a/src/operator/ReduceSumImpl.cpp +++ b/src/operator/ReduceSumImpl.cpp @@ -36,6 +36,51 @@ void Aidge::ReduceSumImpl_cuda::forward() { std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->copy(input.getImpl()->rawPtr(), input.size()); } else { + if (!keepDims && mOutputDesc == nullptr) { + std::vector<int> outputDims; + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); + for (const auto axis:axes) { + outputDims[axis] = 1; + } + if (outputDims.size() < 4) { + outputDims.resize(4, 1); + } + // Compute the corresponding strides + std::vector<int> outputStrides(outputDims.size()); + int product = 1; + for (size_t i = outputDims.size(); i > 0; --i) { + outputStrides[i - 1] = product; + product *= outputDims[i - 1]; + } + + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mOutputDesc)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mOutputDesc, DataTypeToCudnn(op.getOutput(0)->dataType()), outputDims.size(), outputDims.data(), outputStrides.data())); + } + + if (mReduceDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&mReduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(mReduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + DataTypeToCudnn(op.getOutput(0)->dataType()), + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + } + + if (mWorkspace == nullptr) { + const auto outputDesc = (keepDims) + ? 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)) + : mOutputDesc; + + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + mReduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + outputDesc, + &mWorkspaceSize)); + + CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, mWorkspaceSize)); + } + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: forward_<double>(input, axes, keepDims); @@ -59,97 +104,33 @@ void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector< const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - cudnnReduceTensorDescriptor_t reduceDesc; - cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mWorkspace, + mWorkspaceSize, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), input.getImpl()->rawPtr(), &beta, - outputDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - - CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); } else { - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); - std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { - outputDims[axis] = 1; - } - if (outputDims.size() < 4) { - outputDims.resize(4, 1); - } - // Compute the corresponding strides - std::vector<int> outputStrides(outputDims.size()); - int product = 1; - for (size_t i = outputDims.size(); i > 0; --i) { - outputStrides[i - 1] = product; - product *= outputDims[i - 1]; - } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - 
CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mWorkspace, + mWorkspaceSize, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), input.getImpl()->rawPtr(), &beta, - outputDesc, + mOutputDesc, std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - - CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); } } @@ -203,3 +184,17 @@ void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vect alpha, beta); } + +Aidge::ReduceSumImpl_cuda::~ReduceSumImpl_cuda() { + if (mReduceDesc != nullptr) { + cudnnDestroyReduceTensorDescriptor(mReduceDesc); + } + + if (mOutputDesc != nullptr) { + cudnnDestroyTensorDescriptor(mOutputDesc); + } + + if (mWorkspace != nullptr) { + cudaFree(mWorkspace); + } +} -- GitLab From f71872060cdcd9dfa3e71c4be54754f1491e7f7d Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 14:14:47 +0000 Subject: [PATCH 04/12] Fixed poor Sub implementation --- .../aidge/backend/cuda/operator/SubImpl.hpp | 12 +- src/operator/SubImpl.cpp | 187 +++++++++--------- 2 files changed, 99 insertions(+), 100 deletions(-) diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp index 529d0b2..791e231 100644 --- a/include/aidge/backend/cuda/operator/SubImpl.hpp +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -46,10 +46,18 @@ public: void forward() override; void backward() override; + ~SubImpl_cuda(); private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + std::vector<cudnnTensorDescriptor_t> mTensorDesc; + cudnnReduceTensorDescriptor_t mBwdReduceDesc = nullptr; + size_t mBwdWorkspaceSize = 0; + void* mBwdWorkspace = nullptr; + std::vector<std::shared_ptr<Tensor>> mInputFallbacks; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs); + template <class T> void backward_(const Tensor& outGrad); }; // Implementation entry point registration to Operator diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index 249d95f..6fbd78a 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -33,40 +33,50 @@ void Aidge::SubImpl_cuda::forward() { AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); } - std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); - std::vector<Tensor> inputs(op.nbInputs()); - std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - - if (dims[i].size() < 4) { - dims[i].resize(4, 1); - } + if 
(mInputFallbacks.empty()) { + mInputFallbacks.resize(op.nbInputs()); + } - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; + std::vector<std::reference_wrapper<Tensor>> inputs; + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); + + if (mTensorDesc.size() <= i) { + std::vector<int> dims(op.nbInputs()); // For broadcasted dims + std::vector<int> strides(op.nbInputs()); // For the cooresponding strides + + // Get tensor dims and broadcast them + std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); + + if (dims.size() < 4) { + dims.resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims.size()); + int product = 1; + for (size_t j = dims.size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[j - 1]; + } + strides = tensorStrides; + + mTensorDesc.push_back(nullptr); + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mTensorDesc[i], DataTypeToCudnn(op.getOutput(0)->dataType()), dims.size(), dims.data(), strides.data())); } - strides[i] = tensorStrides; } switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - forward_<double>(inputs, dims, strides); + forward_<double>(inputs); break; case DataType::Float32: - forward_<float>(inputs, dims, strides); + forward_<float>(inputs); break; case DataType::Float16: - forward_<half>(inputs, dims, strides); + forward_<half>(inputs); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -74,21 +84,18 @@ void Aidge::SubImpl_cuda::forward() { } template <class T> -void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::SubImpl_cuda::forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + // Add first input to the output CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), + mTensorDesc[0], + inputs[0].get().getImpl()->rawPtr(), &beta, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) @@ -96,18 +103,16 @@ void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std: // Substract other inputs if there are any for (size_t i = 1; i < op.nbInputs(); ++i) { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, 
CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), &gamma, - tensorDesc, - inputs[i].getImpl()->rawPtr(), + mTensorDesc[0], + inputs[i].get().getImpl()->rawPtr(), &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) ); } - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::SubImpl_cuda::backward() { @@ -116,38 +121,42 @@ void Aidge::SubImpl_cuda::backward() { AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Sub operator"); AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Sub backward because the output gradient has no implementation."); - std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); - std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; + if (mBwdReduceDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&mBwdReduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(mBwdReduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + DataTypeToCudnn(op.getOutput(0)->dataType()), + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + } + + if (mBwdWorkspace == nullptr) { + size_t workspaceSize = 0; + for (std::size_t i = 0; i < mTensorDesc.size(); i++) { + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + mBwdReduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mTensorDesc[i], + &workspaceSize)); + + mBwdWorkspaceSize = std::max(workspaceSize, mBwdWorkspaceSize); } - strides[i] = tensorStrides; + + CHECK_CUDA_STATUS(cudaMalloc(&mBwdWorkspace, mBwdWorkspaceSize)); } switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - backward_<double>(outputGrad, dims, strides); + backward_<double>(outputGrad); break; case DataType::Float32: - backward_<float>(outputGrad, dims, strides); + backward_<float>(outputGrad); break; case DataType::Float16: - backward_<half>(outputGrad, dims, strides); + backward_<half>(outputGrad); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -155,10 +164,7 @@ void Aidge::SubImpl_cuda::backward() { } template <class T> -void Aidge::SubImpl_cuda::backward_( - const Tensor& outputGrad, - const std::vector<std::vector<int>>& inputsDims, - const std::vector<std::vector<int>>& inputsStrides) +void 
Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); @@ -166,12 +172,11 @@ void Aidge::SubImpl_cuda::backward_( const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; - for (std::size_t i = 0; i < inputsDims.size(); i++) + for (std::size_t i = 0; i < mTensorDesc.size(); i++) { if (op.getInput(i)->size() == op.getOutput(0)->size()) { - CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), i==0 ? &alpha: &gamma, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), outputGrad.getImpl()->rawPtr(), @@ -182,48 +187,34 @@ void Aidge::SubImpl_cuda::backward_( else // In case of broadcasting { // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor - cudnnReduceTensorDescriptor_t reduceDesc; - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mBwdReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mBwdWorkspace, + mBwdWorkspaceSize, i==0 ? 
&alpha: &gamma, - outputDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)), outputGrad.getImpl()->rawPtr(), &beta, - tensorDesc, + mTensorDesc[i], op.getInput(i)->grad()->getImpl()->rawPtr())); + } + } +} - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); +Aidge::SubImpl_cuda::~SubImpl_cuda() { + for (auto tensorDesc : mTensorDesc) { + if (tensorDesc != nullptr) { + cudnnDestroyTensorDescriptor(tensorDesc); } } + + if (mBwdReduceDesc != nullptr) { + cudnnDestroyReduceTensorDescriptor(mBwdReduceDesc); + } + + if (mBwdWorkspace != nullptr) { + cudaFree(mBwdWorkspace); + } } -- GitLab From 2564218e72c66c8622c072cc567dbb5736f1f62c Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 14:42:04 +0000 Subject: [PATCH 05/12] Fixed typo --- src/operator/ReduceMeanImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp index a3e14c1..3f8e8b9 100644 --- a/src/operator/ReduceMeanImpl.cpp +++ b/src/operator/ReduceMeanImpl.cpp @@ -75,7 +75,7 @@ void Aidge::ReduceMeanImpl_cuda::forward() { CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), mReduceDesc, std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - mOutputDesc, + outputDesc, &mWorkspaceSize)); CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, mWorkspaceSize)); -- GitLab From 61527ab8e647200ddd523f749de769bd561700be Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 15:15:52 +0000 Subject: [PATCH 06/12] Fixed Add operator --- .../aidge/backend/cuda/operator/AddImpl.hpp | 12 +- src/operator/AddImpl.cpp | 200 ++++++++---------- src/operator/SubImpl.cpp | 19 +- 3 files changed, 108 insertions(+), 123 deletions(-) diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp index 42d420f..719c447 100644 --- a/include/aidge/backend/cuda/operator/AddImpl.hpp +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -46,10 +46,18 @@ public: void forward() override; void backward() override; + ~AddImpl_cuda(); private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + std::vector<cudnnTensorDescriptor_t> mTensorDesc; + cudnnReduceTensorDescriptor_t mBwdReduceDesc = nullptr; + size_t mBwdWorkspaceSize = 0; + void* mBwdWorkspace = nullptr; + std::vector<std::shared_ptr<Tensor>> mInputFallbacks; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs); + template <class T> void backward_(const Tensor& outGrad); }; // Implementation entry point registration to Operator diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 8771a79..37681d6 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -33,40 +33,50 @@ void Aidge::AddImpl_cuda::forward() { AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); } - std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); - std::vector<Tensor> inputs(op.nbInputs()); - std::vector<std::vector<int>> 
dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - - if (dims[i].size() < 4) { - dims[i].resize(4, 1); - } + if (mInputFallbacks.empty()) { + mInputFallbacks.resize(op.nbInputs()); + } - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; + std::vector<std::reference_wrapper<Tensor>> inputs; + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); + + if (mTensorDesc.size() <= i) { + std::vector<int> dims(op.nbInputs()); // For broadcasted dims + std::vector<int> strides(op.nbInputs()); // For the cooresponding strides + + // Get tensor dims and broadcast them + std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); + + if (dims.size() < 4) { + dims.resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims.size()); + int product = 1; + for (size_t j = dims.size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[j - 1]; + } + strides = tensorStrides; + + mTensorDesc.push_back(nullptr); + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mTensorDesc[i], DataTypeToCudnn(op.getOutput(0)->dataType()), dims.size(), dims.data(), strides.data())); } - strides[i] = tensorStrides; } switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - forward_<double>(inputs, dims, strides); + forward_<double>(inputs); break; case DataType::Float32: - forward_<float>(inputs, dims, strides); + forward_<float>(inputs); break; case DataType::Float16: - forward_<half>(inputs, dims, strides); + forward_<half>(inputs); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -74,40 +84,23 @@ void Aidge::AddImpl_cuda::forward() { } template <class T> -void Aidge::AddImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::AddImpl_cuda::forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - // Add first input - CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), - &beta, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); - // Add other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) + for (size_t i = 0; i < op.nbInputs(); ++i) { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), &alpha, - tensorDesc, - inputs[i].getImpl()->rawPtr(), - &alpha, + mTensorDesc[i], + inputs[i].get().getImpl()->rawPtr(), + (i > 0) ? &alpha : &beta, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) ); } - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::AddImpl_cuda::backward() { @@ -116,38 +109,42 @@ void Aidge::AddImpl_cuda::backward() { AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Add operator"); AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Add backward because the output gradient has no implementation."); - std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); - std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; + if (mBwdReduceDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&mBwdReduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(mBwdReduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + DataTypeToCudnn(op.getOutput(0)->dataType()), + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + } + + if (mBwdWorkspace == nullptr) { + size_t workspaceSize = 0; + for (std::size_t i = 0; i < mTensorDesc.size(); i++) { + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + mBwdReduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + mTensorDesc[i], + &workspaceSize)); + + mBwdWorkspaceSize = std::max(workspaceSize, mBwdWorkspaceSize); } - strides[i] = tensorStrides; + + CHECK_CUDA_STATUS(cudaMalloc(&mBwdWorkspace, mBwdWorkspaceSize)); } switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - backward_<double>(outputGrad, dims, strides); + backward_<double>(outputGrad); break; case DataType::Float32: - backward_<float>(outputGrad, dims, strides); + backward_<float>(outputGrad); break; case DataType::Float16: - backward_<half>(outputGrad, dims, 
strides); + backward_<half>(outputGrad); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -155,65 +152,56 @@ void Aidge::AddImpl_cuda::backward() { } template <class T> -void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) +void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate - for (std::size_t i = 0; i < inputsDims.size(); i++) + for (std::size_t i = 0; i < mTensorDesc.size(); i++) { if (op.getInput(i)->size() == op.getOutput(0)->size()) { - // TODO: Test if we can avoid copy and simply set rawPtr - op.getInput(i)->grad()->getImpl()->copy(outputGrad.getImpl()->rawPtr(), op.getInput(i)->grad()->size()); + CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(i)->getImpl())->getCudnnTensorDesc(*op.getInput(i)), + op.getInput(i)->grad()->getImpl()->rawPtr())); } else // In case of broadcasting { // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor - cudnnReduceTensorDescriptor_t reduceDesc; - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); - size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); - - void *d_workspace; - CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, + mBwdReduceDesc, NULL, 0, - d_workspace, - workspaceSize, + mBwdWorkspace, + mBwdWorkspaceSize, &alpha, - outputDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)), outputGrad.getImpl()->rawPtr(), &beta, - tensorDesc, + mTensorDesc[i], op.getInput(i)->grad()->getImpl()->rawPtr())); + } + } +} - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); +Aidge::AddImpl_cuda::~AddImpl_cuda() { + for (auto tensorDesc : mTensorDesc) { + if (tensorDesc != nullptr) { + cudnnDestroyTensorDescriptor(tensorDesc); } } + + if (mBwdReduceDesc != nullptr) { + cudnnDestroyReduceTensorDescriptor(mBwdReduceDesc); + } + + if (mBwdWorkspace != nullptr) { + cudaFree(mBwdWorkspace); + } } diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index 6fbd78a..4b44d2c 100644 --- a/src/operator/SubImpl.cpp +++ 
b/src/operator/SubImpl.cpp @@ -90,25 +90,14 @@ void Aidge::SubImpl_cuda::forward_(const std::vector<std::reference_wrapper<Tens const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; - // Add first input to the output - CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - mTensorDesc[0], - inputs[0].get().getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); - // Substract other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) + for (size_t i = 0; i < op.nbInputs(); ++i) { CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), - &gamma, - mTensorDesc[0], + (i > 0) ? &gamma : &alpha, + mTensorDesc[i], inputs[i].get().getImpl()->rawPtr(), - &alpha, + (i > 0) ? &alpha : &beta, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) ); -- GitLab From d9777957a2697c5813e9b4d4c7ba711ad61d197a Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 15:26:48 +0000 Subject: [PATCH 07/12] Fixed Mul --- .../aidge/backend/cuda/operator/MulImpl.hpp | 10 +- src/operator/MulImpl.cpp | 176 ++++++++---------- 2 files changed, 82 insertions(+), 104 deletions(-) diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp index 9a1a4d7..4e29fba 100644 --- a/include/aidge/backend/cuda/operator/MulImpl.hpp +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -46,10 +46,16 @@ public: void forward() override; void backward() override; + ~MulImpl_cuda(); private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + std::vector<cudnnTensorDescriptor_t> mTensorDesc; + cudnnOpTensorDescriptor_t mOpTensorDesc; + std::vector<std::shared_ptr<Tensor>> mInputFallbacks; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs); + template <class T> void backward_(const Tensor& outputGrad); }; // Implementation entry point registration to Operator diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index aa9b4c7..1a0c8d9 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -34,40 +34,55 @@ void Aidge::MulImpl_cuda::forward() { AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Mul inputs with two differents data type."); } - std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); - std::vector<Tensor> inputs(op.nbInputs()); - std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), 
op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + if (mInputFallbacks.empty()) { + mInputFallbacks.resize(op.nbInputs()); + } - if (dims[i].size() < 4) { - dims[i].resize(4, 1); + std::vector<std::reference_wrapper<Tensor>> inputs; + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); + + if (mTensorDesc.size() <= i) { + std::vector<int> dims(op.nbInputs()); // For broadcasted dims + std::vector<int> strides(op.nbInputs()); // For the cooresponding strides + + // Get tensor dims and broadcast them + std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); + + if (dims.size() < 4) { + dims.resize(4, 1); + } + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims.size()); + int product = 1; + for (size_t j = dims.size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[j - 1]; + } + strides = tensorStrides; + + mTensorDesc.push_back(nullptr); + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mTensorDesc[i], DataTypeToCudnn(op.getOutput(0)->dataType()), dims.size(), dims.data(), strides.data())); } + } - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; - } - strides[i] = tensorStrides; + if (mOpTensorDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&mOpTensorDesc)); + CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(mOpTensorDesc, CUDNN_OP_TENSOR_MUL, DataTypeToCudnn(op.getOutput(0)->dataType()), CUDNN_PROPAGATE_NAN)); } switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - forward_<double>(inputs, dims, strides); + forward_<double>(inputs); break; case DataType::Float32: - forward_<float>(inputs, dims, strides); + forward_<float>(inputs); break; case DataType::Float16: - forward_<half>(inputs, dims, strides); + forward_<half>(inputs); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -75,51 +90,37 @@ void Aidge::MulImpl_cuda::forward() { } template <class T> -void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::MulImpl_cuda::forward_(const std::vector<std::reference_wrapper<Tensor>>& inputs) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); - // Multiply inputs - cudnnOpTensorDescriptor_t opTensorDesc; - 
CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); - if(inputs[0].size()>inputs[1].size()) { + if(inputs[0].get().size()>inputs[1].get().size()) { CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, + mOpTensorDesc, &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), + mTensorDesc[0], + inputs[0].get().getImpl()->rawPtr(), &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), + mTensorDesc[1], + inputs[1].get().getImpl()->rawPtr(), &beta, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); } else { CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, + mOpTensorDesc, &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), + mTensorDesc[1], + inputs[1].get().getImpl()->rawPtr(), &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), + mTensorDesc[0], + inputs[0].get().getImpl()->rawPtr(), &beta, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); } - - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); } void Aidge::MulImpl_cuda::backward() { @@ -128,42 +129,17 @@ void Aidge::MulImpl_cuda::backward() { AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Mul operator"); AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Mul backward because the output gradient has no implementation."); - std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); - - std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); - - // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - - if (dims[i].size() < 4) { - dims[i].resize(4, 1); - } - - // Compute the corresponding strides - std::vector<int> tensorStrides(dims[i].size()); - int product = 1; - for (size_t j = dims[i].size(); j > 0; --j) { - tensorStrides[j - 1] = product; - product *= dims[i][j - 1]; - } - strides[i] = tensorStrides; - } + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - backward_<double>(outputGrad, dims, strides); + backward_<double>(outputGrad); break; case DataType::Float32: - backward_<float>(outputGrad, dims, strides); + backward_<float>(outputGrad); break; case DataType::Float16: - backward_<half>(outputGrad, dims, strides); + backward_<half>(outputGrad); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -171,51 +147,47 @@ void 
Aidge::MulImpl_cuda::backward() { } template <class T> -void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate - // Create a Tensor descriptor with the broadcasted dims and strides - cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); - - // Create the operation descriptor - cudnnOpTensorDescriptor_t opTensorDesc; - CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); - // Input0_grad = output_grad * Input1 CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, + mOpTensorDesc, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), outputGrad.getImpl()->rawPtr(), &alpha, - tensorDesc1, + mTensorDesc[1], op.getInput(1)->getImpl()->rawPtr(), &beta, - tensorDesc0, + mTensorDesc[0], op.getInput(0)->grad()->getImpl()->rawPtr())); // Input1_grad = output_grad * Input0 CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, + mOpTensorDesc, &alpha, std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), outputGrad.getImpl()->rawPtr(), &alpha, - tensorDesc0, + mTensorDesc[0], op.getInput(0)->getImpl()->rawPtr(), &beta, - tensorDesc1, + mTensorDesc[1], op.getInput(1)->grad()->getImpl()->rawPtr())); - - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); -} \ No newline at end of file +} + +Aidge::MulImpl_cuda::~MulImpl_cuda() { + for (auto tensorDesc : mTensorDesc) { + if (tensorDesc != nullptr) { + cudnnDestroyTensorDescriptor(tensorDesc); + } + } + + if (mOpTensorDesc != nullptr) { + cudnnDestroyOpTensorDescriptor(mOpTensorDesc); + } +} -- GitLab From 75109fd6179dca617726154fcc9c984fcb606cbc Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 15:54:30 +0000 Subject: [PATCH 08/12] Fixed wrong adaptation --- src/operator/AddImpl.cpp | 9 +++------ src/operator/MulImpl.cpp | 9 +++------ src/operator/SubImpl.cpp | 9 +++------ 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 37681d6..2ab2831 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -42,10 +42,8 @@ void Aidge::AddImpl_cuda::forward() { inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); if (mTensorDesc.size() <= i) { - std::vector<int> dims(op.nbInputs()); // For broadcasted dims - std::vector<int> 
strides(op.nbInputs()); // For the cooresponding strides - // Get tensor dims and broadcast them + std::vector<int> dims; std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); @@ -54,13 +52,12 @@ void Aidge::AddImpl_cuda::forward() { } // Compute the corresponding strides - std::vector<int> tensorStrides(dims.size()); + std::vector<int> strides(dims.size()); int product = 1; for (size_t j = dims.size(); j > 0; --j) { - tensorStrides[j - 1] = product; + strides[j - 1] = product; product *= dims[j - 1]; } - strides = tensorStrides; mTensorDesc.push_back(nullptr); CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index 1a0c8d9..3c0d403 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -43,10 +43,8 @@ void Aidge::MulImpl_cuda::forward() { inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); if (mTensorDesc.size() <= i) { - std::vector<int> dims(op.nbInputs()); // For broadcasted dims - std::vector<int> strides(op.nbInputs()); // For the cooresponding strides - // Get tensor dims and broadcast them + std::vector<int> dims; std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); @@ -55,13 +53,12 @@ void Aidge::MulImpl_cuda::forward() { } // Compute the corresponding strides - std::vector<int> tensorStrides(dims.size()); + std::vector<int> strides(dims.size()); int product = 1; for (size_t j = dims.size(); j > 0; --j) { - tensorStrides[j - 1] = product; + strides[j - 1] = product; product *= dims[j - 1]; } - strides = tensorStrides; mTensorDesc.push_back(nullptr); CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index 4b44d2c..55cb566 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -42,10 +42,8 @@ void Aidge::SubImpl_cuda::forward() { inputs.push_back(op.getInput(i)->refCastFrom(mInputFallbacks[i], *op.getOutput(0))); if (mTensorDesc.size() <= i) { - std::vector<int> dims(op.nbInputs()); // For broadcasted dims - std::vector<int> strides(op.nbInputs()); // For the cooresponding strides - // Get tensor dims and broadcast them + std::vector<int> dims; std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); @@ -54,13 +52,12 @@ void Aidge::SubImpl_cuda::forward() { } // Compute the corresponding strides - std::vector<int> tensorStrides(dims.size()); + std::vector<int> strides(dims.size()); int product = 1; for (size_t j = dims.size(); j > 0; --j) { - tensorStrides[j - 1] = product; + strides[j - 1] = product; product *= dims[j - 1]; } - strides = tensorStrides; mTensorDesc.push_back(nullptr); CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mTensorDesc[i])); -- GitLab From 302de9a793204ea332b116e8fa3d579495068bdd Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 16:14:47 +0000 Subject: [PATCH 09/12] Coding style --- src/operator/AddImpl.cpp | 3 +-- src/operator/MulImpl.cpp | 3 +-- src/operator/SubImpl.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 2ab2831..ad1d852 100644 --- 
a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -43,8 +43,7 @@ void Aidge::AddImpl_cuda::forward() { if (mTensorDesc.size() <= i) { // Get tensor dims and broadcast them - std::vector<int> dims; - std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + std::vector<int> dims(inputs[i].get().dims().begin(), inputs[i].get().dims().end()); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); if (dims.size() < 4) { diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index 3c0d403..e86a6b0 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -44,8 +44,7 @@ void Aidge::MulImpl_cuda::forward() { if (mTensorDesc.size() <= i) { // Get tensor dims and broadcast them - std::vector<int> dims; - std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + std::vector<int> dims(inputs[i].get().dims().begin(), inputs[i].get().dims().end()); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); if (dims.size() < 4) { diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index 55cb566..b092720 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -43,8 +43,7 @@ void Aidge::SubImpl_cuda::forward() { if (mTensorDesc.size() <= i) { // Get tensor dims and broadcast them - std::vector<int> dims; - std::copy(inputs[i].get().dims().begin(), inputs[i].get().dims().end(), std::back_inserter(dims)); + std::vector<int> dims(inputs[i].get().dims().begin(), inputs[i].get().dims().end()); dims.insert(dims.cbegin(), op.getOutput(0)->nbDims() - dims.size(), int(1)); if (dims.size() < 4) { -- GitLab From 4657d0a915965c875001f1a85beb14c458832b0a Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Sat, 22 Feb 2025 16:19:26 +0000 Subject: [PATCH 10/12] Fixed missing initialization --- include/aidge/backend/cuda/operator/MulImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp index 4e29fba..4c1e276 100644 --- a/include/aidge/backend/cuda/operator/MulImpl.hpp +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -50,7 +50,7 @@ public: private: std::vector<cudnnTensorDescriptor_t> mTensorDesc; - cudnnOpTensorDescriptor_t mOpTensorDesc; + cudnnOpTensorDescriptor_t mOpTensorDesc = nullptr; std::vector<std::shared_ptr<Tensor>> mInputFallbacks; std::shared_ptr<Tensor> mOutputGradFallback; -- GitLab From 3ed75e8cd112f89381b4c3905f723d6f86e824b4 Mon Sep 17 00:00:00 2001 From: bhalimi <benjamin.halimi@cea.fr> Date: Mon, 24 Feb 2025 13:25:58 +0000 Subject: [PATCH 11/12] remove default values in source --- src/data/TensorImpl.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu index 528caf1..2084143 100644 --- a/src/data/TensorImpl.cu +++ b/src/data/TensorImpl.cu @@ -36,7 +36,7 @@ cudaCopyToH_kernel(const SRC_T* srcData, } } -template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr> +template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type*> void Aidge::thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size) { cudaCopyToH_kernel<SRC_T><<<(size + 255) / 256, 256>>> @@ -58,7 +58,7 @@ cudaCopyFromH_kernel(const __half* srcData, } } -template <typename DST_T, typename 
std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr> +template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type*> void Aidge::thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size) { cudaCopyFromH_kernel<DST_T><<<(size + 255) / 256, 256>>> -- GitLab From c4f53b0d7efea6fb912335306398c841d7e1b5b4 Mon Sep 17 00:00:00 2001 From: Maxence Naud <maxence.naud@cea.fr> Date: Mon, 3 Mar 2025 14:28:58 +0000 Subject: [PATCH 12/12] [upd] version 0.5.0 -> 0.5.1 --- version.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/version.txt b/version.txt index 8ea2ddf..4b9fcbe 100644 --- a/version.txt +++ b/version.txt @@ -1,2 +1 @@ -0.5.0 - +0.5.1 -- GitLab
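
A short CPU reference may help clarify the AddImpl_cuda::backward_ path shown above. When an input has as many elements as the output, the new code accumulates the output gradient directly with cudnnAddTensor (alpha = 1, beta = 1); otherwise it sums the output gradient over every broadcast axis through the cached cudnnReduceTensor descriptor. The sketch below is illustrative only — the function name and row-major layout are assumptions, not code from the patch — and shows the same reduction in plain C++:

// CPU reference (illustrative, not part of the patch) of the reduction that
// AddImpl_cuda::backward_ delegates to cudnnReduceTensor when an input was
// broadcast: the input gradient is the output gradient summed over every
// axis where the input dimension is 1, accumulated in place (beta = 1).
#include <cstddef>
#include <vector>

void addBackwardBroadcastReference(const std::vector<float>& outputGrad,
                                   const std::vector<int>& outDims,  // e.g. {N, C, H, W}
                                   const std::vector<int>& inDims,   // same rank, 1 on broadcast axes
                                   std::vector<float>& inputGrad)    // size = product of inDims
{
    const std::size_t rank = outDims.size();
    // Walk the output linearly; map each output coordinate to the input
    // coordinate by clamping broadcast axes to 0, then accumulate.
    for (std::size_t flat = 0; flat < outputGrad.size(); ++flat) {
        std::size_t rem = flat, inFlat = 0;
        for (std::size_t d = 0; d < rank; ++d) {
            std::size_t outStride = 1, inStride = 1;
            for (std::size_t k = d + 1; k < rank; ++k) {
                outStride *= outDims[k];
                inStride *= inDims[k];
            }
            const std::size_t coord = rem / outStride;
            rem %= outStride;
            inFlat += (inDims[d] == 1 ? 0 : coord) * inStride;
        }
        inputGrad[inFlat] += outputGrad[flat];
    }
}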
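
The SubImpl_cuda::forward_ rewrite above folds the separate "add first input to the output" call into the loop by switching the cudnnAddTensor scaling factors per iteration. Since cudnnAddTensor computes C = alpha*A + beta*C, the pair (1, 0) initialises the output with the first input and (-1, 1) subtracts each following input. A plain C++ equivalent, illustrative only and ignoring broadcasting, would be:

// CPU reference of the single loop in SubImpl_cuda::forward_: every input goes
// through the same scaled add, so out = in0 - in1 - in2 - ...
#include <cstddef>
#include <vector>

void subForwardReference(const std::vector<std::vector<float>>& inputs,
                         std::vector<float>& output)
{
    for (std::size_t i = 0; i < inputs.size(); ++i) {
        const float srcScale = (i > 0) ? -1.0f : 1.0f;  // gamma for i > 0, alpha otherwise
        const float dstScale = (i > 0) ? 1.0f : 0.0f;   // alpha for i > 0, beta otherwise
        for (std::size_t j = 0; j < output.size(); ++j) {
            output[j] = srcScale * inputs[i][j] + dstScale * output[j];
        }
    }
}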
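
PATCHes 07 through 09 converge on the same descriptor-caching pattern for Add, Mul and Sub: on the first forward() call, each input's shape is left-padded with 1s up to the output rank, extended to the four-dimension minimum the code enforces for cudnnSetTensorNdDescriptor, and given packed strides, after which the cudnnTensorDescriptor_t is created once and reused on every later call. Below is a standalone sketch of that preparation; the helper name broadcastDimsAndStrides and the main() driver are assumptions for illustration, not code from the patch.

// Standalone sketch of the dims/strides preparation done per input before
// caching a cudnnTensorDescriptor_t in the Add/Mul/Sub forward passes.
#include <cstddef>
#include <cstdio>
#include <vector>

static void broadcastDimsAndStrides(const std::vector<int>& inputDims,
                                    std::size_t outputRank,
                                    std::vector<int>& dims,
                                    std::vector<int>& strides)
{
    // Left-pad with 1s to the output rank (cuDNN-style broadcasting).
    dims.assign(inputDims.begin(), inputDims.end());
    dims.insert(dims.cbegin(), outputRank - dims.size(), 1);
    // Enforce at least four dimensions, as the patched code does.
    if (dims.size() < 4) {
        dims.resize(4, 1);
    }
    // Packed row-major strides, innermost dimension last.
    strides.assign(dims.size(), 1);
    int product = 1;
    for (std::size_t j = dims.size(); j > 0; --j) {
        strides[j - 1] = product;
        product *= dims[j - 1];
    }
}

int main() {
    // A {3, 1, 5} input broadcast against a rank-4 output such as {2, 3, 4, 5}.
    std::vector<int> dims, strides;
    broadcastDimsAndStrides({3, 1, 5}, 4, dims, strides);
    for (std::size_t i = 0; i < dims.size(); ++i) {
        std::printf("dim %zu: %d (stride %d)\n", i, dims[i], strides[i]);
    }
    // Expected: dims {1, 3, 1, 5}, strides {15, 5, 5, 1}; these are the values
    // that would be handed to cudnnSetTensorNdDescriptor.
    return 0;
}

Caching the descriptors (and, for Mul, the op descriptor and the reduce descriptor/workspace for Add) also removes the create/destroy calls from the hot path, which is the main point of these patches besides the correctness fixes.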
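
In the MulImpl_cuda::backward_ body of PATCH 07, the two cudnnOpTensor calls implement the elementwise product rule with accumulation (beta = 1), so gradients coming from several consumers add up in the input gradients. Ignoring broadcasting, the computation reduces to the following CPU reference, given purely for illustration:

// CPU reference of what the two cudnnOpTensor calls compute.
#include <cstddef>
#include <vector>

void mulBackwardReference(const std::vector<float>& outputGrad,
                          const std::vector<float>& input0,
                          const std::vector<float>& input1,
                          std::vector<float>& input0Grad,
                          std::vector<float>& input1Grad)
{
    for (std::size_t i = 0; i < outputGrad.size(); ++i) {
        input0Grad[i] += outputGrad[i] * input1[i];  // dL/dx0 = dL/dy * x1
        input1Grad[i] += outputGrad[i] * input0[i];  // dL/dx1 = dL/dy * x0
    }
}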
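
PATCH 11 drops "= nullptr" from the thrust_copy definitions in TensorImpl.cu because a default template argument may be specified on the declaration or on the definition, but not on both, and the header already carries it. A minimal, simplified illustration follows; convert is a made-up stand-in for thrust_copy, not an Aidge function.

// Why the default must only appear once.
#include <cstddef>
#include <type_traits>

// Header: the declaration carries the SFINAE default.
template <typename T,
          typename std::enable_if<!std::is_same<float, T>::value>::type* = nullptr>
void convert(const T* src, float* dst, std::size_t size);

// Source: the definition repeats the parameter without its default; writing
// "= nullptr" again would be rejected as a redefinition of the default argument.
template <typename T,
          typename std::enable_if<!std::is_same<float, T>::value>::type*>
void convert(const T* src, float* dst, std::size_t size) {
    for (std::size_t i = 0; i < size; ++i) {
        dst[i] = static_cast<float>(src[i]);
    }
}

An explicit instantiation such as "template void convert<double>(const double*, float*, std::size_t);" would then live in the source file, mirroring the thrust_copy instantiation lists added earlier in this series.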