diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml index a8b6595731f3ba8ecc14fffcd306ada52bb52616..c5d22e779753ee0fbfa0bcbd828f85639ace8b9f 100644 --- a/.gitlab/ci/build.gitlab-ci.yml +++ b/.gitlab/ci/build.gitlab-ci.yml @@ -1,5 +1,6 @@ include: - - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml' + #- remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml' + - remote: 'https://gitlab.eclipse.org/hrouis/gitlab_shared_files/-/raw/test_hro/.gitlab/ci/shared_script.gitlab-ci.yml' build:ubuntu_cpp: stage: build diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index 6b5a4d639d58476a20325716361f38de28df0d24..da62b81022550a79d63fa1f20aa9429753e5ab6c 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -13,13 +13,20 @@ #define AIDGE_BACKEND_CUDA_IMPORTS_H_ #include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AddImpl.hpp" #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" +#include "aidge/backend/cuda/operator/BatchNormImpl.hpp" #include "aidge/backend/cuda/operator/ConvImpl.hpp" #include "aidge/backend/cuda/operator/FCImpl.hpp" +#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cuda/operator/PadImpl.hpp" #include "aidge/backend/cuda/operator/ReLUImpl.hpp" #include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ReshapeImpl.hpp" +#include "aidge/backend/cuda/operator/SigmoidImpl.hpp" +#include "aidge/backend/cuda/operator/SubImpl.hpp" +#include "aidge/backend/cuda/operator/TanhImpl.hpp" - -#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ \ No newline at end of file +#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index f083a8ba9f68ed53929db95b3fd6604f31548e21..96045781647f93f0627ca0853a0cdaa66a08af83 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -85,14 +85,20 @@ public: // native interface const future_std::span<T>& data() const { return mData; } + inline std::size_t capacity() const noexcept override { return mData.size(); } + std::size_t scalarSize() const noexcept override { return sizeof(T); } + void zeros() override final { + CHECK_CUDA_STATUS(cudaMemset(rawPtr(), T(0), mNbElts * sizeof(T))); + } + void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); const T* srcT = static_cast<const T *>(src); T* dstT = static_cast<T *>(rawPtr(offset)); - AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "overlapping copy is not supported"); + AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", typeid(T).name()); CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice)); } @@ -101,7 +107,7 @@ public: return; } - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + 
AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); switch (srcDt) { case DataType::Float64: thrust_copy(static_cast<const double*>(src), @@ -159,23 +165,23 @@ public: length); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type."); + AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt); break; } } void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); } void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice)); } void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity"); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost)); } @@ -185,7 +191,7 @@ public: }; const void *rawPtr(NbElts_t offset = 0) const override { - AIDGE_ASSERT(mData.size() >= mNbElts, "accessing uninitialized const rawPtr"); + AIDGE_ASSERT(mData.size() >= mNbElts, "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized const rawPtr", typeid(T).name()); return (mData.data() + offset); }; @@ -220,7 +226,7 @@ public: } void setRawPtr(void *ptr, NbElts_t length) override final { - AIDGE_ASSERT(length >= mNbElts, "trying to set raw pointer of insufficient capacity"); + AIDGE_ASSERT(length >= mNbElts, "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer (length: {}) of insufficient capacity (required: {})", typeid(T).name(), length, mNbElts); mData = future_std::span<T>(static_cast<T *>(ptr), length); mDataOwner.reset(); }; @@ -231,7 +237,7 @@ private: void lazyInit() { if (mData.size() < mNbElts) { // Need more data, a re-allocation will occur - AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data"); + AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "TensorImpl_cuda<{}>: trying to enlarge non-owned data", typeid(T).name()); mDataOwner.reset(cudaAlloc(mNbElts)); mData = future_std::span<T>(mDataOwner.get(), mNbElts); } diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cd1819753cd00a325443d9c9c992f3d2347bb377 --- /dev/null +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -0,0 +1,56 @@ +/******************************************************************************** + * 
Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Add.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class AddImpl_cuda : public OperatorImpl { +private: + + +public: + AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) { + return std::make_unique<AddImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + // ~AddImpl_cuda(); +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); +}; + +namespace { +// add cuda backend to Add_Op implementation registry +static Registrar<Add_Op> registrarAddImpl_cuda("cuda", Aidge::AddImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp index 43a6bd57c0c6431705abe73d3f3c175046d72dc9..540ec574f9b5fbcea8b8f28e390cbe05f1e0fa8e 100644 --- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -33,7 +33,7 @@ private: // CuDNN specific variables cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; public: AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} @@ -44,10 +44,12 @@ public: public: void forward(); + void backward(); ~AvgPoolingImpl_cuda(); private: template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); }; namespace { diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3451d07f289371202570434f96546344c0c4fb26 --- /dev/null +++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/BatchNorm.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +template <DimIdx_t DIM> +class BatchNormImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + cudnnTensorDescriptor_t mBNDesc = nullptr; + cudnnBatchNormMode_t mMode; + double mEpsilon; + +public: + BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM> &op) { + return std::make_unique<BatchNormImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + ~BatchNormImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4); + template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); +}; + +namespace { +// add cuda backend to BatchNorm_Op<2> implementation registry +static Registrar<BatchNorm_Op<2>> registrarBatchNormImpl_cuda("cuda", Aidge::BatchNormImpl_cuda<2>::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 8c591927ce0e52daeff447726c114ce3ae4d0103..0722048f7cf021104a9694a621b1c0dad00ce423 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -21,11 +21,13 @@ #include "aidge/backend/OperatorImpl.hpp" #include "aidge/operator/Conv.hpp" +#include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" #include "aidge/backend/cuda/utils/CudaUtils.hpp" + namespace Aidge { template <DimIdx_t DIM> class ConvImpl_cuda : public OperatorImpl { @@ -42,14 +44,19 @@ private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; + bool mDepthWise = false; public: - ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda>(op); } + static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> &op) { + return std::make_unique<ConvImpl_cuda>(op, true); + } + public: void forward(); void backward(); @@ -61,8 +68,8 @@ private: }; namespace { -// add cuda backend to Conv_Op<2> implementation registry static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create); +static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW); } // namespace } // namespace Aidge diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp index 3d8a1348d500fc533c7c9b601b09629995f97427..46f7849d1f17aab5496bdbde013ef078ad1f5a7c 100644 --- 
a/include/aidge/backend/cuda/operator/FCImpl.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -27,9 +27,6 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { -class FCImplForward_cuda : public Registrable<FCImplForward_cuda, - std::tuple<DataType>, - void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {}; class FCImpl_cuda : public OperatorImpl { private: std::shared_ptr<Tensor> mInput0Fallback; @@ -46,10 +43,12 @@ public: public: void forward(); + void backward(); // ~FCImpl_cuda(); private: - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels); + template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); + template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); }; namespace { diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp index 9084e01fc08cb3d00e80fc8cf6246064b20591f2..8d1af8f7c5954c2eae9179926aec433eee34414f 100644 --- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp @@ -32,5 +32,14 @@ cublasStatus_t cublasGemm(cublasHandle_t handle, const T *B, int ldb, const T *beta, T *C, int ldc); + +template <class T> +cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, + const T *alpha, + const T *A, int lda, + const T *x, int incx, + const T *beta, + T *y, int incy); } #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6e0fad5c01efb6474f527dee0bfbfdc594788bc6 --- /dev/null +++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp @@ -0,0 +1,60 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/GlobalAveragePooling.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class GlobalAveragePoolingImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr; + cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + +public: + GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op &op) { + return std::make_unique<GlobalAveragePoolingImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + ~GlobalAveragePoolingImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +namespace { +// add cuda backend to GlobalAveragePooling_Op implementation registry +static Registrar<GlobalAveragePooling_Op> registrarGlobalAveragePoolingImpl_cuda("cuda", Aidge::GlobalAveragePoolingImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp index 4d08d7fab3c7cb2baa18838fd872e44a8eccc923..db7f1e376013db52aeb1b27f8cc3ff192c7f0629 100644 --- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -33,7 +33,7 @@ private: // CuDNN specific variables cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; - std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; public: MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {} @@ -44,10 +44,12 @@ public: public: void forward(); + void backward(); ~MaxPoolingImpl_cuda(); private: template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); }; namespace { diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp b/include/aidge/backend/cuda/operator/PadImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4452d3408e7b4780c1e5c4ea6553ba0b713df231 --- /dev/null +++ b/include/aidge/backend/cuda/operator/PadImpl.hpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +template <DimIdx_t DIM> +class PadImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + int mLeftPad, mTopPad; + double mPadVal; + unsigned int mPadType; + +public: + PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<2> &op) { + return std::make_unique<PadImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + +private: + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& outGrad); +}; + +namespace { +// add cuda backend to Pad_Op<2> implementation registry +static Registrar<Pad_Op<2>> registrarPadImpl_cuda("cuda", Aidge::PadImpl_cuda<2>::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c6a83160da5cf3fea3d3415959c965e16c1eb4ff --- /dev/null +++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp @@ -0,0 +1,37 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge +{ + + template <class T> + void cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + T padValue, + const T *input, + T *outputs); +} +#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp index 6570662fa5df27e54a9df6f357e918243a71330a..285713f460b9d5b5e868c0c07ab23804f30dd694 100644 --- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -36,6 +36,7 @@ private: cudnnActivationMode_t mReLUDesc = nullptr; #endif std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; public: ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {} @@ -46,10 +47,12 @@ public: public: void forward(); + void backward(); ~ReLUImpl_cuda(); private: template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); }; namespace { diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7b43df680bef115310669f0d55f2f78ef4fe9fa6 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp @@ -0,0 +1,53 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Reshape.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class ReshapeImpl_cuda : public OperatorImpl { +private: + std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + +public: + ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) { + return std::make_unique<ReshapeImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + ~ReshapeImpl_cuda(); +}; + +namespace { +// add cuda backend to Reshape_Op implementation registry +static Registrar<Reshape_Op> registrarReshapeImpl_cuda("cuda", Aidge::ReshapeImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..90dbb717732ad788b868fdc95eb55579a5e0b9f6 --- /dev/null +++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp @@ -0,0 +1,64 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Sigmoid.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class SigmoidImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + #if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mSigmoidDesc = nullptr; + #else + cudnnActivationMode_t mSigmoidDesc = nullptr; + #endif + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + +public: + SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) { + return std::make_unique<SigmoidImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + ~SigmoidImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +namespace { +// add cuda backend to Sigmoid_Op implementation registry +static Registrar<Sigmoid_Op> registrarSigmoidImpl_cuda("cuda", Aidge::SigmoidImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fd1a76692abdf16b9854b90f535f68329ae5877a --- /dev/null +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -0,0 +1,56 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Sub.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class SubImpl_cuda : public OperatorImpl { +private: + + +public: + SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) { + return std::make_unique<SubImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + // ~SubImpl_cuda(); +private: + template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); +}; + +namespace { +// add cuda backend to Sub_Op implementation registry +static Registrar<Sub_Op> registrarSubImpl_cuda("cuda", Aidge::SubImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..35e879513fee0ec9354edecefd3d53860e54a0b1 --- /dev/null +++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp @@ -0,0 +1,64 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Tanh.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +class TanhImpl_cuda : public OperatorImpl { +private: + // CuDNN specific variables + #if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mTanhDesc = nullptr; + #else + cudnnActivationMode_t mTanhDesc = nullptr; + #endif + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + +public: + TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) { + return std::make_unique<TanhImpl_cuda>(op); + } + +public: + void forward(); + void backward(); + ~TanhImpl_cuda(); + +private: + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +namespace { +// add cuda backend to Tanh_Op implementation registry +static Registrar<Tanh_Op> registrarTanhImpl_cuda("cuda", Aidge::TanhImpl_cuda::create); +} // namespace +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */ diff --git a/python_binding/pybind_backend_cuda.cpp b/python_binding/pybind_backend_cuda.cpp index abd1997389f3574a24e171f6ab26628dcfe40cfd..3d7564459781d6933827aa66b405b03085806467 100644 --- a/python_binding/pybind_backend_cuda.cpp +++ b/python_binding/pybind_backend_cuda.cpp @@ -1,9 +1,20 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + #include <pybind11/pybind11.h> + // Need to call this header to register every impl #include "aidge/backend/cuda.hpp" namespace py = pybind11; - namespace Aidge { void init_cuda_sys_info(py::module& m); @@ -15,4 +26,4 @@ void init_Aidge(py::module& m){ PYBIND11_MODULE(aidge_backend_cuda, m) { init_Aidge(m); } -} +} // namespace Aidge diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74d89c405530766324407fc42345f237931dc2f4 --- /dev/null +++ b/src/operator/AddImpl.cpp @@ -0,0 +1,213 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AddImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::AddImpl_cuda::forward() {
+    const Add_Op& op = static_cast<const Add_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AddImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    // Create a Tensor descriptor with the broadcasted dims and strides
+    cudnnTensorDescriptor_t tensorDesc;
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data()));
+    // Add first input
+    CHECK_CUDNN_STATUS(
+        cudnnAddTensor(CudaContext::cudnnHandle(),
+                       &alpha,
+                       tensorDesc,
+                       inputs[0].getImpl()->rawPtr(),
+                       &beta,
+                       std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                       std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+    );
+    // Add other inputs if there are any
+    for (size_t i = 1; i < op.nbInputs(); ++i)
+    {
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data()));
+        CHECK_CUDNN_STATUS(
+            cudnnAddTensor(CudaContext::cudnnHandle(),
+                           &alpha,
+                           tensorDesc,
+                           inputs[i].getImpl()->rawPtr(),
+                           &alpha,
+                           std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                           std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+        );
+    }
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+}
+
+void Aidge::AddImpl_cuda::backward() {
+    const Add_Op& op = static_cast<const Add_Op&>(mOp);
+    // Check output
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Add operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Add backward because the output gradient has no implementation.");
+
+    std::shared_ptr<Tensor> outputGradFallback;
+    const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad());
+
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        std::shared_ptr<Tensor> inputFallback;
+        const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outputGrad, dims, strides);
+            break;
+        case DataType::Float32:
+            backward_<float>(outputGrad, dims, strides);
+            break;
+        case DataType::Float16:
+            backward_<half>(outputGrad, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    for (std::size_t i = 0; i < inputsDims.size(); i++)
+    {
+        if (op.getInput(i)->size() == op.getOutput(0)->size())
+        {
+            // TODO: Test if we can avoid copy and simply set rawPtr
+            op.getInput(i)->grad()->getImpl()->copy(outputGrad.getImpl()->rawPtr(), op.getInput(i)->grad()->size());
+        }
+        else // In case of broadcasting
+        {
+            // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor
+            cudnnReduceTensorDescriptor_t reduceDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+            CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                              CUDNN_REDUCE_TENSOR_ADD,
+                                                              CudaContext::data_type<T>::value,
+                                                              CUDNN_PROPAGATE_NAN,
+                                                              CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                              CUDNN_32BIT_INDICES));
+
+            cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0));
+            // Create a Tensor descriptor with the broadcasted dims and strides
+            cudnnTensorDescriptor_t tensorDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+            CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc,
+                                                          CudaContext::data_type<T>::value,
+                                                          inputsDims[i].size(),
+                                                          inputsDims[i].data(),
+                                                          inputsStrides[i].data()));
+            size_t workspaceSize;
+            CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                                                              reduceDesc,
+                                                              outputDesc,
+                                                              tensorDesc,
+                                                              &workspaceSize));
+
+            float *d_workspace;
+            CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+            CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                                                 reduceDesc,
+                                                 NULL,
+                                                 0,
+                                                 d_workspace,
+                                                 workspaceSize,
+                                                 &alpha,
+                                                 outputDesc,
+                                                 outputGrad.getImpl()->rawPtr(),
+                                                 &beta,
+                                                 tensorDesc,
+                                                 op.getInput(i)->grad()->getImpl()->rawPtr()));
+
+            // Release the reduction workspace and descriptors to avoid leaking GPU memory
+            CHECK_CUDA_STATUS(cudaFree(d_workspace));
+            CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+            CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+        }
+    }
+}
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 6692b342c7f745eede689dd79a9a704bbefa9d77..d1270ee4b0a556e1053f3cfde8d71ec5efbee279 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -9,7 +9,6 @@
  *
  ********************************************************************************/
 
-#include <cassert>
 #include <vector>
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
@@ -17,22 +16,22 @@
 #include "aidge/backend/cuda/utils/CudaContext.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 #include "aidge/operator/AvgPooling.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
 
 template <Aidge::DimIdx_t DIM>
 void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp);
 
-    assert(mOp.getRawInput(0) && "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
 
     const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN AvgPooling descriptor
     if (mAvgPoolingDesc == nullptr) {
-        const AvgPooling_Op<DIM>& avgPoolingOp = static_cast<const AvgPooling_Op<DIM>&>(op);
-        const std::vector<int> strides(avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().end());
+        const std::vector<int> strides(op.strideDims().begin(), op.strideDims().end());
         const std::vector<int> paddings(DIM, 0);
-        const std::vector<int> window_dims(avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().end());
+        const std::vector<int> window_dims(op.kernelDims().begin(), op.kernelDims().end());
 
         CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc));
         CHECK_CUDNN_STATUS(
@@ -59,7 +58,7 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
 template <Aidge::DimIdx_t DIM>
 template <class T>
 void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp);
     const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
     const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     CHECK_CUDNN_STATUS(
@@ -71,11 +70,53 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
             input.getImpl()->rawPtr(),
             &beta,
             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
-            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+            op.getOutput(0)->getImpl()->rawPtr()
         )
     );
 }
 
+template <Aidge::DimIdx_t DIM>
+void Aidge::AvgPoolingImpl_cuda<DIM>::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(mAvgPoolingDesc != nullptr, "AvgPool descriptor must be created during forward!");
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float,
+    // except when the operation is performed in double precision.
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingBackward(CudaContext::cudnnHandle(),
+                             mAvgPoolingDesc,
+                             &alpha,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                             std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                             output_grad.getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->getImpl()->rawPtr(),
+                             &beta,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
 template <Aidge::DimIdx_t DIM>
 Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() {
     if(mAvgPoolingDesc != nullptr)
diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5cf079326a0ea003fb72875bcaebefe847086ecb
--- /dev/null
+++ b/src/operator/BatchNormImpl.cpp
@@ -0,0 +1,220 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <cuda_runtime.h> +#include <cudnn.h> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/BatchNormImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/operator/BatchNorm.hpp" +#include "aidge/utils/Types.h" + +template <Aidge::DimIdx_t DIM> +void Aidge::BatchNormImpl_cuda<DIM>::forward() { + // FIXME: uncomment the following code once memory handling will work + AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); + AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); + AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); + AIDGE_ASSERT(mOp.getRawInput(3), "missing input #3"); + AIDGE_ASSERT(mOp.getRawInput(4), "missing input #4"); + + + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, input3Fallback, input4Fallback; + const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input3 = std::static_pointer_cast<Tensor>(mOp.getRawInput(3))->refCastFrom(input3Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input4 = std::static_pointer_cast<Tensor>(mOp.getRawInput(4))->refCastFrom(input4Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + + if (mBNDesc == nullptr) + { + const BatchNorm_Op<DIM>& bnOp = static_cast<const BatchNorm_Op<DIM>&>(mOp); + mEpsilon = static_cast<double>(bnOp.epsilon()); + mMode = CUDNN_BATCHNORM_SPATIAL; + + // CUDNN_BN_MIN_EPSILON is set to 0.0 since cuDNN 7.5.0 + if (CUDNN_BN_MIN_EPSILON > 0.0 && mEpsilon < CUDNN_BN_MIN_EPSILON) { + mEpsilon = CUDNN_BN_MIN_EPSILON; + } + + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mBNDesc)); + CHECK_CUDNN_STATUS(cudnnDeriveBNTensorDescriptor( + mBNDesc, std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), mMode)); + + + cudnnDataType_t dataType; + const unsigned int nbDimsRequested = DIM; + std::vector<int> dims(nbDimsRequested); + std::vector<int> strides(nbDimsRequested); + int nbDims; + CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mBNDesc, + nbDimsRequested, + &dataType, + &nbDims, + &dims[0], + &strides[0])); + dims.resize(nbDims); + strides.resize(nbDims); + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, input3, input4); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, input3, input4); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, input3, input4); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename 
Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + cudnnTensorDescriptor_t tensorDesc; + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels + if (input1.nbDims() == 1) + { + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); + const std::vector<int> dims = {1, static_cast<int>(input1.size()),1, 1}; + const std::vector<int> strides = {static_cast<int>(input1.size()), 1, 1, 1}; + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); + } + else { + tensorDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl())->getCudnnTensorDesc(input1); + } + CHECK_CUDNN_STATUS( + cudnnBatchNormalizationForwardInference( + CudaContext::cudnnHandle(), + mMode, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + tensorDesc, + input1.getImpl()->rawPtr(), + input2.getImpl()->rawPtr(), + input3.getImpl()->rawPtr(), + input4.getImpl()->rawPtr(), + mEpsilon) + ); + if (input1.nbDims() == 1) + { + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); + } +} + +template <Aidge::DimIdx_t DIM> +void Aidge::BatchNormImpl_cuda<DIM>::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(mBNDesc != nullptr, "BatchNorm descriptor must be created during forward!"); + for (IOIndex_t i = 0; i < (op.nbInputs() - 2); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input # {} in BatchNorm operator", i); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run BatchNorm backward because the {}-th input has no implementation.", i); + } + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in BatchNorm operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run BatchNorm backward because the output grad has no implementation."); + + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, outputGradFallback; + const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& bias = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, outputGrad, weights); + break; + case DataType::Float32: + backward_<float>(input0, outputGrad, weights); + break; + case DataType::Float16: + backward_<half>(input0, outputGrad, weights); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& outputGrad, const Tensor& weights) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); 
+ const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type alphaData = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; + + cudnnTensorDescriptor_t scaleBiasDesc; + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels + if (weights.nbDims() == 1) + { + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&scaleBiasDesc)); + const std::vector<int> dims = {1, static_cast<int>(weights.size()),1, 1}; + const std::vector<int> strides = {static_cast<int>(weights.size()), 1, 1, 1}; + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(scaleBiasDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); + } + else { + scaleBiasDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl())->getCudnnTensorDesc(weights); + } + + CHECK_CUDNN_STATUS( + cudnnBatchNormalizationBackward( + CudaContext::cudnnHandle(), + mMode, + &alphaData, + &betaData, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(outputGrad), + outputGrad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr(), + scaleBiasDesc, + weights.getImpl()->rawPtr(), + op.getInput(1)->grad()->getImpl()->rawPtr(), + op.getInput(2)->grad()->getImpl()->rawPtr(), + mEpsilon, + nullptr, + nullptr) // TODO add savedMean and savedVar? + ); + if (weights.nbDims() == 1) + { + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(scaleBiasDesc)); + } +} + +template <Aidge::DimIdx_t DIM> +Aidge::BatchNormImpl_cuda<DIM>::~BatchNormImpl_cuda() { + if(mBNDesc != nullptr) + { + cudnnDestroyTensorDescriptor(mBNDesc); + } +} + +// Template declarations +template class Aidge::BatchNormImpl_cuda<2>; diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index c0c32d3bbb758c9403577c84500bfe951e5e1a96..b627f69a289340b42e1de4baa6bb09d1ea2e5e99 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -9,13 +9,15 @@ * ********************************************************************************/ +#include "aidge/backend/cuda/operator/ConvImpl.hpp" + #include <cassert> #include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ConvImpl.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/Conv.hpp" +#include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Types.h" template <Aidge::DimIdx_t DIM> @@ -33,19 +35,27 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // Lazy-initialize CuDNN convolution descriptor if (mConvDesc == nullptr) { - const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); - const std::vector<int> strides(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end()); - - CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, - DIM, - &paddings[0], - &strides[0], - &upscales[0], - CUDNN_CROSS_CORRELATION, - 
DataTypeToCudnn(op.getOutput(0)->dataType()))); + std::vector<int> strides, upscales; + if (mDepthWise) { + const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp); + strides = std::vector<int>(convDWOp.strideDims().begin(), convDWOp.strideDims().end()); + upscales = std::vector<int>(convDWOp.dilationDims().begin(), convDWOp.dilationDims().end()); + } + else { + const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); + strides = std::vector<int>(convOp.strideDims().begin(), convOp.strideDims().end()); + upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end()); + } + + CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + DIM, + &paddings[0], + &strides[0], + &upscales[0], + CUDNN_CROSS_CORRELATION, + DataTypeToCudnn(op.getOutput(0)->dataType()))); } // Lazy-initialize CuDNN filter descriptor @@ -339,4 +349,5 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { // Template declarations +template class Aidge::ConvImpl_cuda<1>; template class Aidge::ConvImpl_cuda<2>; diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 8b60f7fd6aa41f206b2c6eaa5d8f8daa1bd81374..9948ee1356ad4fedb5d830016ae66ca69a033e38 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -24,27 +24,26 @@ #include "aidge/utils/Types.h" void Aidge::FCImpl_cuda::forward() { - assert(mOp.getRawInput(0) && "missing input #0"); - assert(mOp.getRawInput(1) && "missing input #1"); - assert(mOp.getRawInput(2) && "missing input #2"); + AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); + AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); + AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); const auto& fcOp = static_cast<const FC_Op&>(mOp); - bool noBias = fcOp.template getAttr<FCAttr::NoBias>(); - std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>()); + std::size_t outChannels = fcOp.outChannels(); const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)); + const auto& input2 = (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: - forward_<double>(input0, input1, input2, noBias, outChannels); + forward_<double>(input0, input1, input2, outChannels); break; case DataType::Float32: - forward_<float>(input0, input1, input2, noBias, outChannels); + forward_<float>(input0, input1, input2, outChannels); break; case DataType::Float16: - forward_<half>(input0, input1, input2, noBias, outChannels); + forward_<half>(input0, input1, input2, outChannels); break; default: AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); @@ -52,7 +51,7 @@ void Aidge::FCImpl_cuda::forward() { } template<class T> -void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels) +void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) { const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); @@ -67,8 +66,8 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co int lda = k; // leading dimension of weights int ldb = k; // leading dimension of input int ldc = n; // leading dimension of output - const T alpha = 1.0f; - const T beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -78,13 +77,13 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), weights, ldb, - input, + input, lda, reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), output, ldc)); - if(!noBias){ + if(!input2.empty()){ T* onesVector; CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); // Fill the vector with ones @@ -114,4 +113,109 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co cudaFree(onesVector); } -} \ No newline at end of file +} + +void Aidge::FCImpl_cuda::backward() { + AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); + AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); + AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); + + const auto& fcOp = static_cast<const FC_Op&>(mOp); + std::size_t outChannels = fcOp.outChannels(); + + const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, input1, input2, outChannels); + break; + case DataType::Float32: + backward_<float>(input0, input1, input2, outChannels); + break; + case DataType::Float16: + backward_<half>(input0, input1, input2, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template<class T> +void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) +{ + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const 
typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); + const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); + const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T * weightsGrad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); + + // Performing weightsGrad = (input) * T(outputGrad) + // [n x m] = [n x k] * [k x m] + int m = input0.dims()[input0.nbDims()-1]; + int k = input0.size()/m; + int n = input1.size()/m; + int input0LastDim = input0.dims()[input0.nbDims()-1]; + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_T, + m, + n, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + input, + m, + outputGrad, + n, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + weightsGrad, + m)); + + if(!input2.empty()){ + T * biasGrad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); + T* onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + // Fill the vector with ones + std::vector<T> onesVec(m, T(1.0)); + CHECK_CUDA_STATUS(cudaMemcpy(onesVector, + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); + // Performing biasGrad = outputGrad * onesVector + CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(), + CUBLAS_OP_N, + outChannels, + input0LastDim, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + outputGrad, + outChannels, + onesVector, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + biasGrad, + 1)); + cudaFree(onesVector); + } + // Performing inputGrad = (weights) * (outputGrad) + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + op.getInput(1)->grad()->size()/outChannels, + input0LastDim, + outChannels, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + weights,//w + op.getInput(1)->grad()->size()/outChannels, + outputGrad,//dY + outChannels, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&betaData), + static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()),//dX + op.getInput(1)->grad()->size()/outChannels)); + +} diff --git a/src/operator/FCImpl_CUDA_kernels.cu b/src/operator/FCImpl_CUDA_kernels.cu index 5139ac1d7edf61cf347870e6add2870b2792a0e5..90b7c6bbc9cf0f470a7184e24273aae04da6f3c6 100644 --- a/src/operator/FCImpl_CUDA_kernels.cu +++ b/src/operator/FCImpl_CUDA_kernels.cu @@ -73,4 +73,60 @@ cublasStatus_t cublasGemm<double>(cublasHandle_t handle, beta, C, ldc); } + +template <> +cublasStatus_t cublasGemv<__half>(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, + const __half *alpha, + const __half *A, int lda, + const __half *x, int incx, + const __half *beta, + __half *y, int incy) +{ + // Using cublasHgemm() because there is no cublasHgemv() yet + return cublasHgemm(handle, + trans, CUBLAS_OP_N, + m, 1, n, + alpha, + A, lda, + x, incx, + beta, + y, incy); +} + +template <> +cublasStatus_t cublasGemv<float>(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, + const float *alpha, + const float *A, int lda, + const float *x, int incx, + const float *beta, + float *y, int incy) +{ + return cublasSgemv(handle, trans, + m, n, + alpha, + A, lda, + x, incx, + beta, + y, incy); +} + +template <> +cublasStatus_t 
cublasGemv<double>(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, + const double *alpha, + const double *A, int lda, + const double *x, int incx, + const double *beta, + double *y, int incy) +{ + return cublasDgemv(handle, trans, + m, n, + alpha, + A, lda, + x, incx, + beta, + y, incy); +} } \ No newline at end of file diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8c83d477094d9cce41807d888cca57bd614e9cc6 --- /dev/null +++ b/src/operator/GlobalAveragePoolingImpl.cpp @@ -0,0 +1,111 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/GlobalAveragePooling.hpp" +#include "aidge/utils/Types.h" + +void Aidge::GlobalAveragePoolingImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + // Lazy-initialize CuDNN GlobalAveragePooling descriptor + if (mGlobalAveragePoolingDesc == nullptr) { + int poolingDims = 2; // Assuming 2D pooling + int windowDims[2] = {static_cast<int>(input.dims().at(2)), static_cast<int>(input.dims().at(3))}; // Pooling window dimensions matching spatial dimensions of input tensor + int padding[2] = {0, 0}; // No padding + int stride[2] = {1, 1}; // Stride of 1 + CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc)); + CHECK_CUDNN_STATUS( + cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, poolingDims, windowDims, padding, stride) + // cudnnSetPooling2dDesccomputedOutputriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, 1, 1, 0, 0, 1, 1) + ); + } + + if (op.getOutput(0)->dataType() == DataType::Float64) { + forward_<double>(input); + } + else { + forward_<float>(input); + } +} + +template <class T> +void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + CHECK_CUDNN_STATUS( + cudnnPoolingForward( + CudaContext::cudnnHandle(), + mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr() + ) + ); +} + +void Aidge::GlobalAveragePoolingImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, "GlobalAvgPool descriptor must be 
created during forward!"); + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + if (op.getOutput(0)->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::GlobalAveragePoolingImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + const T alpha = 1.0f; + const T beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnPoolingBackward(CudaContext::cudnnHandle(), + mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); +} + +Aidge::GlobalAveragePoolingImpl_cuda::~GlobalAveragePoolingImpl_cuda() { + if(mGlobalAveragePoolingDesc != nullptr) + cudnnDestroyPoolingDescriptor(mGlobalAveragePoolingDesc); +} + diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index de41915e7506cd121f25a6112252ecea92b047d5..39050635102ebebaed8192cb4bb338e2bc31d5e8 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -21,18 +21,17 @@ template <Aidge::DimIdx_t DIM> void Aidge::MaxPoolingImpl_cuda<DIM>::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); - assert(mOp.getRawInput(0) && "missing input #0"); + AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto& input = op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0)); // Lazy-initialize CuDNN MaxPooling descriptor if (mMaxPoolingDesc == nullptr) { - const MaxPooling_Op<DIM>& maxPoolingOp = static_cast<const MaxPooling_Op<DIM>&>(op); - const std::vector<int> strides(maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().end()); + const std::vector<int> strides(op_.strideDims().begin(), op_.strideDims().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> window_dims(maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().end()); + const std::vector<int> window_dims(op_.kernelDims().begin(), op_.kernelDims().end()); CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc)); CHECK_CUDNN_STATUS( @@ -45,10 +44,11 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward() { &strides[0])); } + // Do the actual forward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
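+    // The float/double dispatch below exists because cuDNN requires the alpha/beta scaling
+    // factors to be passed as float for FP16/FP32 tensors and as double for FP64 tensors.
+    // Cuda::cudnn_scaling_type<T> encodes that rule; a minimal sketch of such a trait
+    // (assumed shape, the actual definition lives in the backend's CUDA utility headers):
+    //
+    //   template <class T> struct cudnn_scaling_type         { using type = float;  };
+    //   template <>        struct cudnn_scaling_type<double> { using type = double; };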
- if (op.getOutput(0)->dataType() == DataType::Float64) { + if (op_.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); } else { @@ -59,7 +59,7 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward() { template <Aidge::DimIdx_t DIM> template <class T> void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; CHECK_CUDNN_STATUS( @@ -70,12 +70,54 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), input.getImpl()->rawPtr(), &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr() + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr() ) ); } +template <Aidge::DimIdx_t DIM> +void Aidge::MaxPoolingImpl_cuda<DIM>::backward() { + const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + + AIDGE_ASSERT(mMaxPoolingDesc != nullptr, "MaxPool descriptor must be created during forward!"); + AIDGE_ASSERT(op_.getOutput(0)->grad(), "missing output grad #0"); + + const auto& output_grad = op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op_.getOutput(0)->grad()); + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. 
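+    // cudnnPoolingBackward() takes the tensors in (y, dy, x, dx) order, i.e. the forward
+    // output, the incoming gradient, the forward input, and the gradient to be written:
+    //
+    //   cudnnPoolingBackward(handle, poolingDesc,
+    //                        &alpha, yDesc,  y,    // forward output
+    //                                dyDesc, dy,   // gradient w.r.t. output
+    //                                xDesc,  x,    // forward input
+    //                        &beta,  dxDesc, dx);  // gradient w.r.t. input (written)
+    //
+    // which is how the descriptors are wired in backward_<T>() below.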
+ if (op_.getOutput(0)->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { + const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + + const T alpha = 1.0f; + const T beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnPoolingBackward(CudaContext::cudnnHandle(), + mMaxPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->grad()->getImpl()->rawPtr())); +} + template <Aidge::DimIdx_t DIM> Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() { if(mMaxPoolingDesc != nullptr) diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3606ba66d002f1467aa65771015cab02c066d5a5 --- /dev/null +++ b/src/operator/PadImpl.cpp @@ -0,0 +1,137 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/PadImpl.hpp" + +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Types.h" + +template <Aidge::DimIdx_t DIM> +void Aidge::PadImpl_cuda<DIM>::forward() +{ + const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); + + AIDGE_ASSERT(op.getInput(0), "missing input in Pad operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pad forward input has no implementation."); + + const auto &input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + auto paddingBorders = op.beginEndBorders(); + + mLeftPad = paddingBorders[2]; + mTopPad = paddingBorders[0]; + mPadVal = op.borderValue(); + mPadType = static_cast<unsigned int>(op.borderType()); + + switch (op.getOutput(0)->dataType()) + { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) +{ + const auto outDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); + const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + T *output = static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + Aidge::cudaPadding(CudaContext::getDeviceProp(), + outDims[1], + outDims[3], + outDims[2], + input.dims()[1], + input.dims()[0], + input.dims()[3], + input.dims()[2], + mLeftPad, + mTopPad, + mPadType, + static_cast<T>(mPadVal), + inputPtr, + output); +} + +template <Aidge::DimIdx_t DIM> +void Aidge::PadImpl_cuda<DIM>::backward() +{ + const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); + + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Pad operator"); + AIDGE_ASSERT(op.getOutput(0)->grad(), "cannot run Pad backward, output gradient has no implementation."); + + const auto &outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)); + + auto paddingBorders = op.beginEndBorders(); + + mLeftPad = paddingBorders[2]; + mTopPad = paddingBorders[0]; + mPadVal = op.borderValue(); + mPadType = static_cast<unsigned int>(op.borderType()); + + switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) + { + case DataType::Float64: + backward_<double>(outGrad); + break; + case DataType::Float32: + backward_<float>(outGrad); + break; + case DataType::Float16: + backward_<half>(outGrad); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <Aidge::DimIdx_t DIM> +template <class T> +void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) +{ + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const auto inputGradDims = op.getInput(0)->grad()->dims(); + T *inputGrad = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + Aidge::cudaPadding(CudaContext::getDeviceProp(), + inputGradDims[1], + 
inputGradDims[3], + inputGradDims[2], + outGrad.dims()[1], + outGrad.dims()[0], + outGrad.dims()[3], + outGrad.dims()[2], + -mLeftPad, + -mTopPad, + mPadType, + static_cast<T>(mPadVal), + static_cast<const T *>(outGrad.getImpl()->rawPtr()), + inputGrad); +} + +// Template declarations +template class Aidge::PadImpl_cuda<2>; diff --git a/src/operator/PadImpl_CUDA_kernels.cu b/src/operator/PadImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..a20a4c10a6cb5e783a09868389b8f968bc0f42a3 --- /dev/null +++ b/src/operator/PadImpl_CUDA_kernels.cu @@ -0,0 +1,224 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp" + +template <typename T> +__global__ void cudaPadding_kernel(unsigned int nbOutputs, + unsigned int outputWidth, + unsigned int outputHeight, + unsigned int nbChannels, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + T padValue, + const T *input, + T *outputs) +{ + const unsigned int inputOffset = (blockIdx.z * blockDim.z + threadIdx.z) * nbChannels * inputWidth * inputHeight; + + const unsigned int outputOffset = (blockIdx.z * blockDim.z + threadIdx.z) * nbOutputs * outputWidth * outputHeight; + + // nbCh = nbChannels for propagate + // = nbOutputs for back-propagate + const unsigned int nbCh = min(nbChannels, nbOutputs); + + for (unsigned int ch = blockIdx.x; ch < nbCh; ch += gridDim.x) + { + for (unsigned int oy = threadIdx.y; oy < outputHeight; oy += blockDim.y) + { + for (unsigned int ox = threadIdx.x; ox < outputWidth; ox += blockDim.x) + { + T outputValue = padValue; + + if (padType == 0) // Const padding + { + int ix = (int)ox - leftPad; + int iy = (int)oy - topPad; + + if (ix >= 0 && ix < (int)inputWidth && iy >= 0 && iy < (int)inputHeight) + { + outputValue = input[ix + + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + } + } + else if (padType == 1) // Edge padding + { + int ix = max(0, min((int)inputWidth - 1, (int)ox - leftPad)); + int iy = max(0, min((int)inputHeight - 1, (int)oy - topPad)); + + outputValue = input[ix + + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + } + else if (padType == 2) // Reflect padding + { + int ix = (int)ox - leftPad; + int iy = (int)oy - topPad; + + if (ix < 0) + ix = 0 - ix; + if (iy < 0) + iy = 0 - iy; + if (ix >= (int)inputWidth) + ix = (int)inputWidth - ix; + if (iy >= (int)inputHeight) + iy = (int)inputHeight - iy; + + outputValue = input[ix + + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + } + else if (padType == 3) // Wrap padding + { + int ix = (inputWidth + (int)ox - leftPad) % inputWidth; + int iy = (inputHeight + (int)oy - topPad) % inputHeight; + + outputValue = input[ix + + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + } + outputs[ox + oy * outputWidth + ch * outputWidth * outputHeight + outputOffset] = outputValue; + } + } + } +} + +template <> // double +void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int 
nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + double padValue, + const double *input, + double *outputs) +{ + const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; + const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; + + const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize) + ? outputsWidth * outputsHeight + : maxSize; + + const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth); + + const unsigned int groupWidth = min(prefMultiple, reqWidth); + const dim3 blocksPerGrid = {nbChannels, 1, batchSize}; + const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1}; + + cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs, + outputsWidth, + outputsHeight, + nbChannels, + inputWidth, + inputHeight, + leftPad, + topPad, + padType, + padValue, + input, + outputs); + CHECK_CUDA_STATUS(cudaPeekAtLastError()); +} + +template <> // float +void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + float padValue, + const float *input, + float *outputs) +{ + const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; + const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; + + const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize) + ? outputsWidth * outputsHeight + : maxSize; + + const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth); + + const unsigned int groupWidth = min(prefMultiple, reqWidth); + const dim3 blocksPerGrid = {nbChannels, 1, batchSize}; + const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1}; + + cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs, + outputsWidth, + outputsHeight, + nbChannels, + inputWidth, + inputHeight, + leftPad, + topPad, + padType, + padValue, + input, + outputs); + CHECK_CUDA_STATUS(cudaPeekAtLastError()); +} + +template <> // half +void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + half padValue, + const half *input, + half *outputs) +{ + const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; + const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; + + const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize) + ? 
outputsWidth * outputsHeight + : maxSize; + + const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth); + + const unsigned int groupWidth = min(prefMultiple, reqWidth); + const dim3 blocksPerGrid = {nbChannels, 1, batchSize}; + const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1}; + + cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs, + outputsWidth, + outputsHeight, + nbChannels, + inputWidth, + inputHeight, + leftPad, + topPad, + padType, + padValue, + input, + outputs); + CHECK_CUDA_STATUS(cudaPeekAtLastError()); +} \ No newline at end of file diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp index 0a4eeeb7d8a2a4be94b5ac6b43dbae69cd8e3869..80d52045e832b42a95b6d7448f2016530bb9d1ac 100644 --- a/src/operator/ReLUImpl.cpp +++ b/src/operator/ReLUImpl.cpp @@ -1,5 +1,5 @@ /******************************************************************************** - * Copyright (c) 2023 CEA-List + * Copyright (c) 2024 CEA-List * * This program and the accompanying materials are made available under the * terms of the Eclipse Public License 2.0 which is available at @@ -64,6 +64,55 @@ void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) { std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); } +void Aidge::ReLUImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + // Lazy-initialize CuDNN ReLU descriptor + if (mReLUDesc == nullptr) { + #if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); + CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( + mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); + #else + mReLUDesc = CUDNN_ACTIVATION_RELU; + #endif + } + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. 
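+    // For the default ReLU activation (coef 0.0), cudnnActivationBackward() computes,
+    // element-wise,
+    //
+    //   dx[i] = dy[i]   if x[i] > 0
+    //   dx[i] = 0       otherwise
+    //
+    // blended with the scaling factors as dx = alpha * result + beta * priorDx.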
+ if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnActivationBackward(CudaContext::cudnnHandle(), + mReLUDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); +} + Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() { if (mReLUDesc != nullptr) { #if CUDNN_VERSION >= 5000 diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8016a5a9d1dfc26454af2cb03b6fe573820245f5 --- /dev/null +++ b/src/operator/ReshapeImpl.cpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReshapeImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/operator/Reshape.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ReshapeImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // FIXME: uncomment the following code once memory handling will work + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))-> getImpl() -> setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); +} + +void Aidge::ReshapeImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); +} + +Aidge::ReshapeImpl_cuda::~ReshapeImpl_cuda() { + +} + diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..386cd9d821b3019cf8f0de2cc757ae514446f1a6 --- /dev/null +++ b/src/operator/SigmoidImpl.cpp @@ -0,0 +1,123 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/SigmoidImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Sigmoid.hpp" +#include "aidge/utils/Types.h" + +void Aidge::SigmoidImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + // Lazy-initialize CuDNN Sigmoid descriptor + if (mSigmoidDesc == nullptr) { + #if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( + mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); + #else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; + #endif + } + + // Do the actual forward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. 
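+    // cudnnActivationForward() with CUDNN_ACTIVATION_SIGMOID computes, element-wise,
+    //
+    //   y = 1 / (1 + exp(-x))
+    //
+    // and the matching backward pass (see backward_<T>() below) uses the identity
+    // dy/dx = y * (1 - y), which is why cudnnActivationBackward() needs the forward
+    // output y in addition to the incoming gradient.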
+ if (op.getOutput(0)->dataType() == DataType::Float64) { + forward_<double>(input); + } + else { + forward_<float>(input); + } +} + +template <class T> +void Aidge::SigmoidImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnActivationForward(CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); +} + +void Aidge::SigmoidImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + // Lazy-initialize CuDNN Sigmoid descriptor + if (mSigmoidDesc == nullptr) { + #if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( + mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); + #else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; + #endif + } + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. + if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::SigmoidImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnActivationBackward(CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); +} + +Aidge::SigmoidImpl_cuda::~SigmoidImpl_cuda() { + if (mSigmoidDesc != nullptr) { + #if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mSigmoidDesc); + #endif + } +} + diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..adebd2a1bdcede94f159627f67860e7ec60a5d85 --- /dev/null +++ b/src/operator/SubImpl.cpp @@ -0,0 +1,219 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms 
of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/SubImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Sub.hpp" +#include "aidge/utils/Types.h" + +void Aidge::SubImpl_cuda::forward() { + const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Sub operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sub forward because the 0-th input has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in Sub operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sub forward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); + } + + std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); + std::vector<Tensor> inputs(op.nbInputs()); + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; + // Create a Tensor descriptor with the broadcasted dims and strides + cudnnTensorDescriptor_t tensorDesc; + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + // Add first input to the output + CHECK_CUDNN_STATUS( + cudnnAddTensor(CudaContext::cudnnHandle(), + &alpha, + tensorDesc, + 
inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) + ); + // Substract other inputs if there are any + for (size_t i = 1; i < op.nbInputs(); ++i) + { + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); + CHECK_CUDNN_STATUS( + cudnnAddTensor(CudaContext::cudnnHandle(), + &gamma, + tensorDesc, + inputs[i].getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) + ); + } + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); +} + +void Aidge::SubImpl_cuda::backward() { + const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + // Check output + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Sub operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Sub backward because the output gradient has no implementation."); + + std::shared_ptr<Tensor> outputGradFallback; + const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + + std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims + std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + std::shared_ptr<Tensor> inputFallback; + const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + + // Get tensor dims and broadcast them + std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + + // Compute the corresponding strides + std::vector<int> tensorStrides(dims[i].size()); + int product = 1; + for (size_t j = dims[i].size(); j > 0; --j) { + tensorStrides[j - 1] = product; + product *= dims[i][j - 1]; + } + strides[i] = tensorStrides; + } + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; + for (std::size_t i = 0; i < inputsDims.size(); i++) + { + if (op.getInput(i)->size() == op.getOutput(0)->size()) + { + CHECK_CUDNN_STATUS( + cudnnAddTensor(CudaContext::cudnnHandle(), + i==0 ? 
&alpha: &gamma, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(i)->getImpl())->getCudnnTensorDesc(*op.getInput(i)), + op.getInput(i)->grad()->getImpl()->rawPtr())); + } + else // In case of broadcasting + { + // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor + cudnnReduceTensorDescriptor_t reduceDesc; + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + // Create a Tensor descriptor with the broadcasted dims and strides + cudnnTensorDescriptor_t tensorDesc; + CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); + size_t workspaceSize; + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + outputDesc, + tensorDesc, + &workspaceSize)); + + float *d_workspace; + CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); + + CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + i==0 ? &alpha: &gamma, + outputDesc, + outputGrad.getImpl()->rawPtr(), + &beta, + tensorDesc, + op.getInput(i)->grad()->getImpl()->rawPtr())); + + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); + } + } +} \ No newline at end of file diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96c0330febba35cfea04bbbac97d9308195d6309 --- /dev/null +++ b/src/operator/TanhImpl.cpp @@ -0,0 +1,123 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/TanhImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Tanh.hpp" +#include "aidge/utils/Types.h" + +void Aidge::TanhImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + // Lazy-initialize CuDNN Tanh descriptor + if (mTanhDesc == nullptr) { + #if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( + mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0)); + #else + mTanhDesc = CUDNN_ACTIVATION_TANH; + #endif + } + + // Do the actual forward computation + // Template is only for scaling parameters, which are always in float + // except when the operation is performed in double precision. + if (op.getOutput(0)->dataType() == DataType::Float64) { + forward_<double>(input); + } + else { + forward_<float>(input); + } +} + +template <class T> +void Aidge::TanhImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnActivationForward(CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); +} + +void Aidge::TanhImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + assert(op.getOutput(0)->grad() && "missing output #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + // Lazy-initialize CuDNN Tanh descriptor + if (mTanhDesc == nullptr) { + #if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( + mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0)); + #else + mTanhDesc = CUDNN_ACTIVATION_TANH; + #endif + } + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // except when the operation is performed in double precision.
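+    // For CUDNN_ACTIVATION_TANH, the backward pass relies on the identity
+    //
+    //   y = tanh(x),   dy/dx = 1 - y^2
+    //
+    // so cudnnActivationBackward() reconstructs the input gradient from the forward
+    // output y and the incoming gradient dy: dx = dy * (1 - y * y).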
+ if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + CHECK_CUDNN_STATUS( + cudnnActivationBackward(CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); +} + +Aidge::TanhImpl_cuda::~TanhImpl_cuda() { + if (mTanhDesc != nullptr) { + #if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mTanhDesc); + #endif + } +} + diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8129175d88323c896244e531f1dd52a5cbaa19e --- /dev/null +++ b/unit_tests/Test_AddImpl.cpp @@ -0,0 +1,448 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Add.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + {{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + input1->setBackend("cuda"); + SECTION("One input") { + std::shared_ptr<Node> myAdd = Add(1); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + op->associateInput(0, input1); + op->setBackend("cuda"); + op->setDataType(DataType::Float32); + myAdd->forward(); + + float* computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost); + float* targetOutput = new float[input1->size()](); + cudaMemcpy(targetOutput, input1->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < input1->size(); i++){ + REQUIRE(fabs(computedOutput[i] - targetOutput[i]) < 1e-6); + } + + delete[] computedOutput; + delete[] targetOutput; + } + + SECTION("Two inputs") { + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { + { + {{40, 94},{42, 96},{44, 98}}, + {{46, 100},{48, 102},{50, 104}}, + {{52, 106},{54, 108},{56, 110}} + }, + { + {{58, 112},{60, 114},{62, 116}}, + {{64, 118},{66, 120},{68, 122}}, + {{70, 124},{72, 126},{74, 128}} + }, + { + {{76, 130},{78, 132},{80, 134}}, + {{82, 136},{84, 138},{86, 140}}, + {{88, 142},{90, 144},{92, 146}} + } + } + }); + + std::shared_ptr<Node> myAdd = Add(2); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input1); + op->setBackend("cuda"); + op->setDataType(DataType::Float32); + myAdd->forward(); + + float* computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Three inputs") { + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { + { + {{ 60, 141},{ 63, 144},{ 66, 147}}, + {{ 69, 150},{ 72, 153},{ 75, 156}}, + {{ 78, 159},{ 81, 162},{ 84, 165}} + }, + { + {{ 87, 168},{ 90, 171},{ 93, 174}}, + {{ 96, 177},{ 99, 180},{102, 183}}, + {{105, 186},{108, 189},{111, 192}} + }, + { + {{114, 195},{117, 198},{120, 201}}, + {{123, 204},{126, 207},{129, 210}}, + {{132, 213},{135, 216},{138, 219}} + } + } + }); + + std::shared_ptr<Node> myAdd = Add(3); + 
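+        // With the same input tensor wired to all three inputs, every expected value is
+        // simply three times the corresponding input element, e.g. 3 * 20 = 60 and
+        // 3 * 47 = 141 for the first {20, 47} entry above. The test then copies the CUDA
+        // output back to the host with cudaMemcpy(..., cudaMemcpyDeviceToHost) and checks
+        // each element against this reference within 1e-6.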
auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input1); + op->associateInput(2, input1); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAdd->forward(); + + float* computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Broadcasting") { + std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { + { // + { // + {{0, 1},{2, 3},{4, 5}} // + }, // + { // + {{6, 7},{8, 9},{10, 11}} // + }, // + { // + {{12, 13},{14, 15},{16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { + { // + { // + {{20, 21},{22, 23},{24, 25}}, // + {{26, 27},{28, 29},{30, 31}}, // + {{32, 33},{34, 35},{36, 37}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{ 120, 222},{ 124, 226},{ 128, 230}}, // + {{ 126, 228},{ 130, 232},{ 134, 236}}, // + {{ 132, 234},{ 136, 238},{ 140, 242}} // + }, // + { // + {{ 126, 228},{ 130, 232},{ 134, 236}}, // + {{ 132, 234},{ 136, 238},{ 140, 242}}, // + {{ 138, 240},{ 142, 244},{ 146, 248}} // + }, // + { // + {{ 132, 234},{ 136, 238},{140, 242}}, // + {{ 138, 240},{ 142, 244},{146, 248}}, // + {{ 144, 246},{ 148, 250},{152, 254}} // + } // + } // + }); // + input_0->setBackend("cuda"); + input_1->setBackend("cuda"); + input_2->setBackend("cuda"); + std::shared_ptr<Node> myAdd = Add(3); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + op->associateInput(0, input_0); + op->associateInput(1, input_1); + op->associateInput(2, input_2); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAdd->forward(); + + float* computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Add Operator CUDA + std::shared_ptr<Node> 
myAddCUDA = Add(2, "myaddcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator()); + + // Create Add Operator CPU + std::shared_ptr<Node> myAddCPU = Add(2, "myaddcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + // To test broadcasting, set some dims to 1 + if (boolDist(gen)) { + dims0.push_back(1); + }else{ + dims0.push_back(dim); + } + if (boolDist(gen)) { + dims1.push_back(1); + }else{ + dims1.push_back(dim); + } + dims.push_back(std::max(dims0[i], dims1[i])); + } + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + float* array0 = new float[nb_elements0]; + float* array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + float *computedOutput = new float[nb_elements](); + cudaMemcpy(computedOutput, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + 
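+            // Stricter element-wise check (illustrative addition, not in the original test):
+            // the approxEq call below only dereferences the first element of each buffer, so an
+            // explicit loop over the broadcast output can be used as well; computedOutput,
+            // computedCPU and nb_elements are all in scope here.
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                REQUIRE(fabs(computedOutput[i] - computedCPU[i]) < 1e-6);
+            }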
REQUIRE(approxEq<float>(*computedOutput, *computedCPU)); + + delete[] array0; + delete[] array1; + delete[] computedOutput; + cudaFree(array0_d); + cudaFree(array1_d); + + } + } + +} + +TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { + std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { + { // + { // + {{0, 1},{2, 3},{4, 5}} // + }, // + { // + {{6, 7},{8, 9},{10, 11}} // + }, // + { // + {{12, 13},{14, 15},{16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { + { // + { // + {{20, 21},{22, 23},{24, 25}}, // + {{26, 27},{28, 29},{30, 31}}, // + {{32, 33},{34, 35},{36, 37}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); + + input_0->setBackend("cuda"); + input_1->setBackend("cuda"); + input_2->setBackend("cuda"); + std::shared_ptr<Node> myAdd = Add(3); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + op->associateInput(0, input_0); + op->associateInput(1, input_1); + op->associateInput(2, input_2); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAdd->forward(); + + // Run and test backward operation + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{ 1, 2},{ 3, 4},{ 5, 6}}, // + {{ 7, 8},{ 9, 10},{ 11, 12}}, // + {{ 13, 14},{ 15, 16},{ 17, 18}} // + }, // + { // + {{ 19, 20},{ 21, 22},{ 23, 24}}, // + {{ 25, 26},{ 27, 28},{ 29, 30}}, // + {{ 31, 32},{ 33, 34},{ 35, 36}} // + }, // + { // + {{ 37, 38},{ 39, 40},{41, 42}}, // + {{ 43, 44},{ 45, 46},{47, 48}}, // + {{ 49, 50},{ 51, 52},{53, 54}} // + } // + } // + }); // + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myAdd->backward()); + + std::shared_ptr<Tensor> expectedInput1Grad = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { + { // + { // + {{21, 24},{27, 30},{33, 36}} // + }, // + { // + {{75, 78},{81, 84},{87, 90}} // + }, // + { // + {{129, 132},{135, 138},{141, 144}}// + } // + } // + }); // + std::shared_ptr<Tensor> expectedInput2Grad = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { + { // + { // + {{57, 60},{63, 66},{69, 72}}, // + {{75, 78},{81, 84},{87, 90}}, // + {{93, 96},{99, 102},{105, 108}} // + } // + } // + }); // + std::shared_ptr<Tensor> expectedInput3Grad = std::make_shared<Tensor>(Array1D<float,2> {{729, 756}}); + + float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); + cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost); + float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); + cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost); + float *computedGrad3Cuda = new float[expectedInput3Grad->size()](); + cudaMemcpy(computedGrad3Cuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput3Grad->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedInput1Grad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); + } + for(int i = 0; i < expectedInput2Grad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); 
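+        // Where the expected gradients come from (illustrative note, not part of the original
+        // test): for a broadcast Add, each input gradient is the output gradient summed over
+        // the axes where that input has size 1. For example expectedInput3Grad = {729, 756}:
+        // myOutputGrad holds 1..54 with odd values at last-dim index 0 and even values at
+        // index 1, so 1 + 3 + ... + 53 = 27^2 = 729 and 2 + 4 + ... + 54 = 27 * 28 = 756.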
+ } + for(int i = 0; i < expectedInput3Grad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInput3Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad3Cuda[i] - targetOutput) < 1e-6); + } + + delete[] computedGrad1Cuda; + delete[] computedGrad2Cuda; + delete[] computedGrad3Cuda; +} \ No newline at end of file diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp index dfadebbe07aa38371576cf4006773484494751a0..3dccd6b7f909a9e9b4f8affb151898b77d94a7cf 100644 --- a/unit_tests/Test_AvgPoolingImpl.cpp +++ b/unit_tests/Test_AvgPoolingImpl.cpp @@ -10,151 +10,289 @@ ********************************************************************************/ #include <array> +#include <cuda_fp16.h> // half type +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> -#include <cuda_fp16.h> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution - -#include "Test_cuda.hpp" - -#include "aidge/data/half.hpp" -#include "aidge/data/Tensor.hpp" #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/data/half.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; -TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}} - }, - { - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - SECTION("Stride") { - std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator()); +TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") +{ + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{// NCHW + { + {{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { - { - { - {{ 3, 5}, - { 13, 15}}, - {{ 28, 30}, - { 38, 40}} - }, - { - {{103, 105}, - {113, 115}}, - {{128, 130}, - {138, 140}} - } - } - }); - op->associateInput(0,myInput); + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}, + {{{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + SECTION("Stride") + { + std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); + auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + {{{{3, 5}, + {13, 
15}}, + {{28, 30}, + {38, 40}}}, + {{{103, 105}, + {113, 115}}, + {{128, 130}, + {138, 140}}}}}); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); - float* computedOutput = new float[myOutput->size()](); + float *computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) + { + const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Stride >= feature dim") { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW - { - { - {{0.3745, 0.9507, 0.7320}, - {0.5987, 0.1560, 0.1560}, - {0.0581, 0.8662, 0.6011}} - } - } - }); - std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> { - {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}} - }); - op->associateInput(0,myInput2); + SECTION("Stride >= feature dim") + { + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW + { + {{{0.3745, 0.9507, 0.7320}, + {0.5987, 0.1560, 0.1560}, + {0.0581, 0.8662, 0.6011}}}}}); + std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3}); + auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ + {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}}); + op->associateInput(0, myInput2); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); - float* computedOutput = new float[myOutput->size()](); + float *computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) + { + const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("half") { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW - { - { - {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)}, - {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)}, - {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}} - } - } - }); + SECTION("half") + { + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{// NCHW + { + {{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)}, + {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)}, + {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}}); myInput2->setBackend("cuda"); - std::shared_ptr<Node> 
myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator()); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> { - {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}} - }); - op->associateInput(0,myInput2); + std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); + auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ + {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}}); + op->associateInput(0, myInput2); op->setDataType(DataType::Float16); op->setBackend("cuda"); myAvgPool->forward(); - half_float::half* computedOutput = new half_float::half[myOutput->size()](); + half_float::half *computedOutput = new half_float::half[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) + { + const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + std::size_t kernel = 3; + std::size_t stride = 3; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), + std::size_t(10)); + + // To measure execution time of 'AveragePooling_Op::forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create AveragePooling Operator CUDA + std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, "myAvgPoolCuda", {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAvgPoolCuda->getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + + // Create AveragePooling Operator CUDA + std::shared_ptr<Node> myAvgPoolCpu = AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAvgPoolCpu->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + // generate a random Tensor + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), 
std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // Fill input tensor + float *array0 = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) + { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // Run inference + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] computed_cuda; + delete[] array0; + cudaFree(array0_d); + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } +} + +TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") +{ + // Run forward operation + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4> {// NCHW + { + { + { + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}, + {13, 14, 15, 16} + } + } + } + }); + myInput->setBackend("cuda"); + + std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); + auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + op->associateInput(0, myInput); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAvgPool->forward(); + + // Run and test backward operation + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float, 1,1,2,2> { + { + { + { + {1, 2}, + {3, 4} + } + } + } + }); + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myAvgPool->backward()); + + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4>{ + { + { + { + {0.25, 0.25, 0.5, 0.5}, + {0.25, 0.25, 0.5, 0.5}, + {0.75, 0.75, 1, 1}, + {0.75, 0.75, 1, 1} + } + } + } + }); + + float *computedGradCuda = new float[expectedInputGrad->size()](); + cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedInputGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + delete[] 
computedGradCuda; } \ No newline at end of file diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c83624020d86a2eb786d249c5ee664ca3bfdde3b --- /dev/null +++ b/unit_tests/Test_BatchNormImpl.cpp @@ -0,0 +1,438 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include "Test_cuda.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] BatchNorm(forward)") { + SECTION("Static Input") { + std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{0.9159252643585205, 0.18772238492965698, 0.4479946792125702}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.33898890018463135, 0.3167555630207062, 0.7047033309936523}}); + std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.45547693967819214, 0.22650663554668427, 0.6612948179244995}}); + std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.02570258639752865, 0.026536229997873306, 0.15111008286476135}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW + { + { + {{0.12943482, 0.6451229 , 0.24979436}, + {0.7551012, 0.32007095, 0.89463896}, + {0.7087448, 0.6266124, 0.4782957 }}, + + {{0.13796203, 0.9950787, 0.71555305}, + {0.01347321, 0.4395316, 0.43097174}, + {0.6056306 , 0.9561122 , 0.5783939 }}, + + {{0.7174486 , 0.503465 , 0.23695093}, + {0.5145477, 0.39576462, 0.02779444}, + {0.60789394 ,0.14119725 ,0.20753163}} + }, + + + {{{0.74452287, 0.5354875 , 0.8148496 }, + {0.73356223, 0.4304034 , 0.11783765}, + {0.8966221, 0.41049036, 0.95982736}}, + + {{0.03161403, 0.71250844, 0.14337301}, + {0.5338889 , 0.13484782, 0.8055851 }, + {0.71784616 ,0.8349626 , 0.10107189}}, + + {{0.85701346, 0.58286697, 0.9836816 }, + {0.36061534, 0.03660944, 0.7375317 }, + {0.6977233, 0.51965624, 0.29440993}} + } + } + }); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { + { + { + {{-1.5233592, 1.4222438, -0.83586717}, + { 2.0504384, -0.43444824, 2.847476 }, + { 1.7856512, 1.3165123, 0.46932936}}, + + {{ 0.21473758 , 1.2022772, 0.8802177 }, + { 0.07130594 , 0.5621954, 0.55233306}, + { 0.7535689 , 1.1573814, 0.72218764}}, + + {{ 0.7694162 , 0.52281666, 0.2156798 }, + { 0.5355886 , 0.3987003, -0.02535689}, + { 0.6431629 , 0.10533108 , 0.18177633}}}, + + + {{{ 1.990015, 0.7960079, 2.3917203 }, + { 1.9274082, 0.19576907, -1.5896021 }, + { 2.8588037 , 0.08202624 , 3.2198315 }}, + + {{ 0.09220716, 0.8767097, 0.22097193}, + { 0.6709106 , 0.2111495, 
0.9839494 }, + { 0.8828597 , 1.0177971 , 0.17223406}}, + + {{ 0.9302539 , 0.6143213 , 1.0762292 }, + { 0.35819346, -0.01519828, 0.79256046}, + { 0.7466844 , 0.5414758 , 0.28189686}} + } + } + }); + myInput->setBackend("cuda"); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + myMean->setBackend("cuda"); + myVar->setBackend("cuda"); + + op->associateInput(0,myInput); + op->associateInput(1,myWeights); + op->associateInput(2,myBias); + op->associateInput(3,myMean); + op->associateInput(4,myVar); + op->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-5); + } + + delete[] computedOutput; + } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + constexpr float epsilon = 0.00001F; + constexpr float momentum = 0.1F; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // generate a random Tensor + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nbChannels = dims[1]; + + + // Create BatchNorm Operator Cuda + std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + + // Create BatchNorm Operator CPU + std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + float* array0 = new float[nb_elements]; + float* weights = new float[nbChannels]; + float* bias = new float[nbChannels]; + float* mean = new float[nbChannels]; + float* var = new float[nbChannels]; + + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nbChannels; ++i) { + weights[i] = valueDist(gen); + bias[i] = valueDist(gen); + mean[i] = valueDist(gen); + var[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *weight_d, *bias_d, *mean_d, *var_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + 
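+            // Illustrative note: setRawPtr() below only wraps this externally allocated device
+            // buffer, it does not take ownership (the impl's internal owner is reset), so the
+            // buffer must outlive the forward pass and is released with cudaFree() at the end
+            // of the trial. A scope guard could do this automatically, e.g. (hypothetical):
+            //   std::unique_ptr<float, decltype(&cudaFree)> guard(array0_d, cudaFree);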
T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // weight CUDA + std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); + Tw_cuda->setDataType(DataType::Float32); + Tw_cuda->setBackend("cuda"); + Tw_cuda->resize({nbChannels}); + op_cuda->associateInput(1, Tw_cuda); + cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * nbChannels); + cudaMemcpy(weight_d, weights, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + Tw_cuda->getImpl()->setRawPtr(weight_d, nbChannels); + + // weight CPU + std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,Tw_cpu); + Tw_cpu->setDataType(DataType::Float32); + Tw_cpu->setBackend("cpu"); + Tw_cpu->resize({nbChannels}); + Tw_cpu -> getImpl() -> setRawPtr(weights, nbChannels); + + // bias CUDA + std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); + Tb_cuda->setDataType(DataType::Float32); + Tb_cuda->setBackend("cuda"); + Tb_cuda->resize({nbChannels}); + op_cuda->associateInput(2, Tb_cuda); + cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); + cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); + + // bias CPU + std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(2,Tb_cpu); + Tb_cpu->setDataType(DataType::Float32); + Tb_cpu->setBackend("cpu"); + Tb_cpu->resize({nbChannels}); + Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + + // mean CUDA + std::shared_ptr<Tensor> Tm_cuda = std::make_shared<Tensor>(); + Tm_cuda->setDataType(DataType::Float32); + Tm_cuda->setBackend("cuda"); + Tm_cuda->resize({nbChannels}); + op_cuda->associateInput(3, Tm_cuda); + cudaMalloc(reinterpret_cast<void **>(&mean_d), sizeof(float) * nbChannels); + cudaMemcpy(mean_d, mean, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + Tm_cuda->getImpl()->setRawPtr(mean_d, nbChannels); + + // mean CPU + std::shared_ptr<Tensor> Tm_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(3,Tm_cpu); + Tm_cpu->setDataType(DataType::Float32); + Tm_cpu->setBackend("cpu"); + Tm_cpu->resize({nbChannels}); + Tm_cpu -> getImpl() -> setRawPtr(mean, nbChannels); + + // var CUDA + std::shared_ptr<Tensor> Tv_cuda = std::make_shared<Tensor>(); + Tv_cuda->setDataType(DataType::Float32); + Tv_cuda->setBackend("cuda"); + Tv_cuda->resize({nbChannels}); + op_cuda->associateInput(4, Tv_cuda); + cudaMalloc(reinterpret_cast<void **>(&var_d), sizeof(float) * nbChannels); + cudaMemcpy(var_d, var, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + Tv_cuda->getImpl()->setRawPtr(var_d, nbChannels); + + // var CPU + std::shared_ptr<Tensor> Tv_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(4,Tv_cpu); + Tv_cpu->setDataType(DataType::Float32); + Tv_cpu->setBackend("cpu"); + Tv_cpu->resize({nbChannels}); + Tv_cpu -> getImpl() -> setRawPtr(var, nbChannels); + + // forward CUDA + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, 
op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] array0; + delete[] weights; + delete[] bias; + delete[] mean; + delete[] var; + delete[] computed_cuda; + cudaFree(array0_d); + cudaFree(weight_d); + cudaFree(bias_d); + cudaFree(mean_d); + cudaFree(var_d); + } + std::cout << "total time: " << duration.count() << "μs" << std::endl; + + } +} +TEST_CASE("[gpu/operator] BatchNorm(backward)") { + SECTION("Static Input") { + std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + // Forward + std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{-1.58390772, -0.48463920, 1.30413496}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.06150287, -0.03140282, -0.49673468}}); + std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.68328333, -0.47286209, 1.11688483}}); + std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.84838068, 1.05930495, 0.53670371}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { //NCHW + { + { + { + {1.46650600, 1.24083233}, + {-0.33106008, -0.15137172} + }, + { + { 0.06625678, -1.83266091}, + { 0.53444749, -0.05167147} + }, + { + { 0.41069385, -0.70850474}, + { 0.23363227, 0.06111236} + } + }, + { + { + { 0.16707586, 1.07217050}, + { 1.18544745, 0.03441877} + }, + { + { 0.88106865, 0.33312374}, + { 0.87147945, 1.46628737} + }, + { + { 0.23930393, -0.94172227}, + { 1.48735642, 0.46449399} + } + } + } + }); + + myInput->setBackend("cuda"); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + myMean->setBackend("cuda"); + myVar->setBackend("cuda"); + + op->associateInput(0,myInput); + op->associateInput(1,myWeights); + op->associateInput(2,myBias); + op->associateInput(3,myMean); + op->associateInput(4,myVar); + op->forward(); + + // Backward + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { + { + { + { + { 1.34347093, 0.90813798}, + { 0.39607167, 1.20428133} + }, + { + { 0.16845724, 0.48487359}, + { 0.40748054, -0.21790814} + }, + { + {-1.83932650, -0.42746788}, + { 0.97129798, 2.04073548} + } + }, + { + { + {-0.95714629, 0.18446854}, + { 1.14551663, -1.38118088} + }, + { + {-0.44466951, 2.73914146}, + { 0.57898718, 2.23699141} + }, + { + { 0.25004527, -0.18481003}, + {-0.72439206, 0.87744337} + } + + } + } + }); + + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + std::shared_ptr<Tensor> weights = op->getInput(1); + std::shared_ptr<Tensor> bias = op->getInput(2); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myBatchNorm->backward()); + + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + { + { + { + {-0.92418045, -0.26092845}, + {-1.53920066, -3.14756274}}, + + {{ 0.26948565, -0.18548687}, + { 0.21506749, 0.45458069}}, + + {{-3.57358932, -1.30609703}, + { 1.61337423, 3.55250096}}}, + + + {{{ 2.41264391, 1.16695499}, + {-0.90373814, 3.19601130}}, + + {{ 0.71554798, -1.04076481}, + { 0.17618656, 
-0.60461664}}, + + {{ 0.26926503, -0.92978811}, + {-1.13964832, 1.51398242} + } + } + } + }); + + float *computedGradCuda = new float[expectedInputGrad->size()](); + cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedInputGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + delete[] computedGradCuda; + } +} \ No newline at end of file diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4655de069cce86e80881a06673621c8159be18f6 --- /dev/null +++ b/unit_tests/Test_ConvDepthWiseImpl.cpp @@ -0,0 +1,314 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { + SECTION("Deterministic Input") { + std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); + auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> { + { + {{ + { 0, 1, 2}, + { 3, 4, 5}, + { 6, 7, 8} + + }}, + {{ + { 27, 28, 29}, + { 30, 31, 32}, + { 33, 34, 35} + + }}, + {{ + { 54, 55, 56}, + { 57, 58, 59}, + { 60, 61, 62} + }}, + {{ + { 81, 82, 83}, + { 84, 85, 86}, + { 87, 88, 89} + }} + } + }); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}}, + + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}} + }, + { + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}, + + {{150, 151, 152, 153, 154}, + {155, 156, 157, 158, 159}, + {160, 161, 162, 163, 164}, + {165, 166, 167, 168, 169}, + {170, 171, 172, 173, 174}}, + + {{175, 176, 177, 178, 179}, + {180, 181, 182, 183, 184}, + {185, 186, 187, 188, 189}, + {190, 191, 192, 193, 194}, + {195, 196, 197, 198, 199}} + } + 
} + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { + { + { + {{ 319, 355, 391}, + { 499, 535, 571}, + { 679, 715, 751}}, + + {{ 8745, 9024, 9303}, + { 10140, 10419, 10698}, + { 11535, 11814, 12093}}, + + {{ 29337, 29859, 30381}, + { 31947, 32469, 32991}, + { 34557, 35079, 35601}}, + + {{ 62061, 62826, 63591}, + { 65886, 66651, 67416}, + { 69711, 70476, 71241}} + }, + { + {{ 3919, 3955, 3991}, + { 4099, 4135, 4171}, + { 4279, 4315, 4351}}, + + {{ 36645, 36924, 37203}, + { 38040, 38319, 38598}, + { 39435, 39714, 39993}}, + + {{ 81537, 82059, 82581}, + { 84147, 84669, 85191}, + { 86757, 87279, 87801}}, + + {{138561, 139326, 140091}, + {142386, 143151, 143916}, + {146211, 146976, 147741}} + } + } + }); + myInput->setBackend("cuda"); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + + myCDW -> forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + const std::size_t kernel = kernelDist(gen); + std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2), + std::size_t(10)); + const std::size_t nbDims = 4; + // input (batch, ch, Xin, Yin) + // weight (outCh, ch, kernelX, kernelY) + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + if(i < 2) + dims.push_back(dimSizeDist(gen)); + else + dims.push_back(resolutionDist(gen)); + } + dims[1] = 1; // TODO FIX: ConvDepthWise doesn't give the same output in CUDA as in CPU unless channels is 1 + const std::size_t nbChannels = dims[1]; + const std::vector<std::size_t> dimsW{nbChannels,nbChannels,kernel,kernel}; + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // Create ConvDepthWise Operator CUDA + std::shared_ptr<Node> myConvCUDA = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + + // Create ConvDepthWise Operator CPU + std::shared_ptr<Node> myConvCPU = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + 
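+            // Illustrative note: with the default unit stride and dilation and no padding, each
+            // spatial output dimension of the depthwise convolution is (in - kernel + 1);
+            // resolutionDist guarantees in >= kernel + 2, so every trial yields at least a
+            // 3x3 output map, e.g. (hypothetical check):
+            //   const std::size_t outH = dims[2] - kernel + 1;
+            //   const std::size_t outW = dims[3] - kernel + 1;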
op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + + float* array0 = new float[nb_elements]; + float* weights = new float[wieghtSize]; + float* bias = new float[nbChannels]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < wieghtSize; ++i) { + weights[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nbChannels; ++i) { + bias[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *weight_d, *bias_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // weight CUDA + std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); + Tw_cuda->setDataType(DataType::Float32); + Tw_cuda->setBackend("cuda"); + Tw_cuda->resize(dimsW); + op_cuda->associateInput(1, Tw_cuda); + cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); + + // weight CPU + std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,Tw_cpu); + Tw_cpu->setDataType(DataType::Float32); + Tw_cpu->setBackend("cpu"); + Tw_cpu->resize(dimsW); + Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + + // bias CUDA + std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); + Tb_cuda->setDataType(DataType::Float32); + Tb_cuda->setBackend("cuda"); + Tb_cuda->resize({nbChannels}); + op_cuda->associateInput(2, Tb_cuda); + cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); + cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); + + // bias CPU + std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(2,Tb_cpu); + Tb_cpu->setDataType(DataType::Float32); + Tb_cpu->setBackend("cpu"); + Tb_cpu->resize({nbChannels}); + Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] array0; + delete[] weights; + delete[] bias; + delete[] computed_cuda; + cudaFree(array0_d); + cudaFree(weight_d); + cudaFree(bias_d); + } + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } 
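+    // Follow-up sketch for the TODO above (illustrative, not part of the original test): once
+    // the CUDA/CPU mismatch for multi-channel inputs is resolved, the dims[1] = 1 restriction
+    // can be dropped; depthwise weights are then typically shaped one filter per input channel:
+    //   const std::vector<std::size_t> dimsW{nbChannels, 1, kernel, kernel};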
+} \ No newline at end of file diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp index 12e40cf8266a86259c5128b425919214f2db6052..dc77e35b64fd22952e683e373fcc271c742ece75 100644 --- a/unit_tests/Test_ConvImpl.cpp +++ b/unit_tests/Test_ConvImpl.cpp @@ -10,15 +10,15 @@ ********************************************************************************/ #include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> -#include "Test_cuda.hpp" - -#include "aidge/data/Tensor.hpp" - #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -222,4 +222,147 @@ TEST_CASE("[gpu/operator] Conv(forward)") { delete[] computedOutput; } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + const std::size_t kernel = kernelDist(gen); + std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2), + std::size_t(10)); + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + if(i < 2) + dims.push_back(dimSizeDist(gen)); + else + dims.push_back(resolutionDist(gen)); + } + const std::size_t outChannels = dimSizeDist(gen); + const std::vector<std::size_t> dimsW{outChannels,dims[1],kernel,kernel}; + const std::size_t inChannels = dims[1]; + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // Create Conv Operator CUDA + std::shared_ptr<Node> myConvCUDA = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + + // Create Conv Operator CPU + std::shared_ptr<Node> myConvCPU = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + + float* array0 = new float[nb_elements]; + float* weights = new float[wieghtSize]; + float* bias = new float[outChannels]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < wieghtSize; ++i) { + weights[i] = valueDist(gen); + } + for (std::size_t i = 0; i < outChannels; ++i) { + bias[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *weight_d, *bias_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void 
**>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // weight CUDA + std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); + Tw_cuda->setDataType(DataType::Float32); + Tw_cuda->setBackend("cuda"); + Tw_cuda->resize(dimsW); + op_cuda->associateInput(1, Tw_cuda); + cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); + + // weight CPU + std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,Tw_cpu); + Tw_cpu->setDataType(DataType::Float32); + Tw_cpu->setBackend("cpu"); + Tw_cpu->resize(dimsW); + Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + + // bias CUDA + std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); + Tb_cuda->setDataType(DataType::Float32); + Tb_cuda->setBackend("cuda"); + Tb_cuda->resize({outChannels}); + op_cuda->associateInput(2, Tb_cuda); + cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); + cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); + + // bias CPU + std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(2,Tb_cpu); + Tb_cpu->setDataType(DataType::Float32); + Tb_cpu->setBackend("cpu"); + Tb_cpu->resize({outChannels}); + Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] array0; + delete[] weights; + delete[] bias; + delete[] computed_cuda; + cudaFree(array0_d); + cudaFree(weight_d); + cudaFree(bias_d); + } + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } + } diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp index 0126755d08727597b00823b2055300e7b15accb3..472fd273b1b5eff49e0d05ebd499afdb1435770c 100644 --- a/unit_tests/Test_FCImpl.cpp +++ b/unit_tests/Test_FCImpl.cpp @@ -10,122 +10,342 @@ ********************************************************************************/ #include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> -#include "Test_cuda.hpp" - -#include "aidge/data/Tensor.hpp" - #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" using 
namespace Aidge; TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{ - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{ - {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}); - myWeights->setBackend("cuda"); - myBias->setBackend("cuda"); - std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{ - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, - {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); - myInput->setBackend("cuda"); - op->associateInput(0, myInput); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); - myFC->forward(); + SECTION("Static Input") { + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{ + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{ + {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc"); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + SECTION("2D input") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{ + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, + {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); + myInput->setBackend("cuda"); + op->associateInput(0, myInput); + op -> setDataType(DataType::Float32); + op -> setBackend("cuda"); + myFC->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + delete[] computedOutput; } + SECTION("4D input") { + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 
76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + myInput->setBackend("cuda"); + op->associateInput(0, myInput); + op -> setDataType(DataType::Float32); + op -> setBackend("cuda"); + myFC->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - delete[] computedOutput; + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } } - SECTION("4D input") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}, - {{50, 51, 52, 53, 54}, - {55, 56, 57, 58, 59}, - {60, 61, 62, 63, 64}, - {65, 66, 67, 68, 69}, - {70, 71, 72, 73, 74}}}, - {{{75, 76, 77, 78, 79}, - {80, 81, 82, 83, 84}, - {85, 86, 87, 88, 89}, - {90, 91, 92, 93, 94}, - {95, 96, 97, 98, 99}}, - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); + + SECTION("Random Input"){ + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t outChannels = dimSizeDist(gen); + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t inChannels = nb_elements / dims[0]; + + const std::vector<std::size_t> dimsW{outChannels, inChannels}; + const std::size_t wieghtSize = outChannels * inChannels; + + // Create FC Operator CUDA + std::shared_ptr<Node> myFCCUDA = FC(inChannels, outChannels, false, "myfccuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myFCCUDA -> getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + + // Create FC Operator CPU + std::shared_ptr<Node> myFCCPU = FC(inChannels, outChannels, false, "myfccpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myFCCPU -> getOperator()); + 
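+            // The CPU FC operator serves as the reference implementation: both backends are fed
+            // the same randomly generated input, weights and bias, and the CUDA output is checked
+            // against the CPU output at the end of each trial.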
op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + + float* array0 = new float[nb_elements]; + float* weights = new float[wieghtSize]; + float* bias = new float[outChannels]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < wieghtSize; ++i) { + weights[i] = valueDist(gen); + } + for (std::size_t i = 0; i < outChannels; ++i) { + bias[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d, *weight_d, *bias_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // weight CUDA + std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); + Tw_cuda->setDataType(DataType::Float32); + Tw_cuda->setBackend("cuda"); + Tw_cuda->resize(dimsW); + op_cuda->associateInput(1, Tw_cuda); + cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); + + // weight CPU + std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,Tw_cpu); + Tw_cpu->setDataType(DataType::Float32); + Tw_cpu->setBackend("cpu"); + Tw_cpu->resize(dimsW); + Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + + // bias CUDA + std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); + Tb_cuda->setDataType(DataType::Float32); + Tb_cuda->setBackend("cuda"); + Tb_cuda->resize({outChannels}); + op_cuda->associateInput(2, Tb_cuda); + cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); + cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); + + // bias CPU + std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(2,Tb_cpu); + Tb_cpu->setDataType(DataType::Float32); + Tb_cpu->setBackend("cpu"); + Tb_cpu->resize({outChannels}); + Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + + // forward CUDA + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] array0; + delete[] weights; + delete[] bias; + delete[] computed_cuda; + cudaFree(array0_d); + cudaFree(weight_d); + cudaFree(bias_d); + } + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } +} + +TEST_CASE("[gpu/operator] FC(backward)", "[FC][GPU]") { + 
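+    // The reference gradients used below follow the usual fully-connected backward formulas,
+    // assuming Y = X.W^T + B with X of shape (batch, inChannels) and W of shape (outChannels, inChannels):
+    //   dX = dY.W       dW = dY^T.X       dB = sum of dY over the batch
+    // e.g. dX[0][0] = 0.1*0.1 + 0.2*0.4 = 0.09, which matches expectedInputGrad below.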
SECTION("2D input") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + { + {0.1, 0.2, 0.3}, + {0.4, 0.5, 0.6} + }}); myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{0.1, 0.2, 0.3}, + {0.4, 0.5, 0.6}}}); + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}}); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op->associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); op -> setDataType(DataType::Float32); op -> setBackend("cuda"); myFC->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + // Run and test backward operation + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 2, 2> { + { + {0.1, 0.2}, + {0.3, 0.4} + } + }); + myOutputGrad->setBackend("cuda"); + std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); + std::shared_ptr<Tensor> input = op->getInput(0); + predictedOutput->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myFC->backward()); + + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { + { + {0.09, 0.12, 0.15}, + {0.19, 0.26, 0.33} + } + }); + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float,2> { + {0.4, 0.6} + }); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float,2,3> { + { + {0.13, 0.17, 0.21}, + {0.18, 0.24, 0.3 } + } + }); + float *computedGradCuda = new float[expectedInputGrad->size()](); + cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + float *computedGradWCuda = new float[expectedWeightsGrad->size()](); + cudaMemcpy(computedGradWCuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedWeightsGrad->size(), cudaMemcpyDeviceToHost); + float *computedGradBCuda = new float[expectedBiasGrad->size()](); + cudaMemcpy(computedGradBCuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedBiasGrad->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + for(int i = 0; i < expectedInputGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } + for(int i = 0; i < expectedBiasGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedBiasGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradBCuda[i] - targetOutput) < 1e-6); + } + for(int i = 0; i < expectedWeightsGrad->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedWeightsGrad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradWCuda[i] - targetOutput) < 1e-6); + } + + + - delete[] computedOutput; + delete[] computedGradCuda; + delete[] computedGradWCuda; + delete[] computedGradBCuda; } } \ No newline at end of file diff --git a/unit_tests/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/Test_GlobalAveragePoolingImpl.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..0a0f22ab60ced3a3f7648ce798484f72bd67839a --- /dev/null +++ b/unit_tests/Test_GlobalAveragePoolingImpl.cpp @@ -0,0 +1,172 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +// #include <aidge/utils/Types.h> +// #include <catch2/catch_test_macros.hpp> +// #include <chrono> +// #include <cmath> +// #include <memory> +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +namespace Aidge { +TEST_CASE("[gpu/operator] GlobalAveragePooling", + "[GlobalAveragePooling][GPU]") { + + SECTION("4D-Tensor") + { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,4,4> { //NCHW + { + { + {{0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11}, + {12, 13, 14, 15}}, + + {{16, 17, 18, 19}, + {20, 21, 22, 23}, + {24, 25, 26, 27}, + {28, 29, 30, 31}}, + + {{32, 33, 34, 35}, + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,3,1,1> { + { + { + {{ 7.5 }}, + {{ 23.5 }}, + {{ 39.5 }} + } + } + }); + myInput->setBackend("cuda"); + myInput->setDataType(DataType::Float32); + // Create MyGlobalAveragePooling Operator + std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); + auto op = std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->associateInput(0, myInput); + + globAvgPool->forward(); + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + + // To measure execution time of 'AveragePooling_Op::forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create GlobalAveragePooling Operator CUDA + std::shared_ptr<Node> myGAvgPoolCuda = GlobalAveragePooling(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCuda->getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + + // Create GlobalAveragePooling Operator CUDA + 
std::shared_ptr<Node> myGAvgPoolCpu = GlobalAveragePooling(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCpu->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + // generate a random Tensor + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // Fill input tensor + float *array0 = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) + { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // Run inference + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] computed_cuda; + delete[] array0; + cudaFree(array0_d); + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } +} +} // namespace Aidge diff --git a/unit_tests/Test_MaxPoolingImpl.cpp b/unit_tests/Test_MaxPoolingImpl.cpp index bc2efdd447363044dc02fab06964909756a8e2d1..99850a0715cf8feb3164d58c410a1ef689feece1 100644 --- a/unit_tests/Test_MaxPoolingImpl.cpp +++ b/unit_tests/Test_MaxPoolingImpl.cpp @@ -10,20 +10,20 @@ ********************************************************************************/ #include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> -#include "Test_cuda.hpp" - -#include "aidge/data/Tensor.hpp" - #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; -TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") { +TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW { { @@ -89,4 +89,95 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") { delete[] computedOutput; } + + SECTION("Random Input") { + constexpr 
std::uint16_t NBTRIALS = 10; + std::size_t kernel = 3; + std::size_t stride = 3; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), + std::size_t(10)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create MaxPooling Operator CUDA + std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, "myMaxPoolCuda", {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMaxPoolCuda->getOperator()); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + + // Create MaxPooling Operator CUDA + std::shared_ptr<Node> myMaxPoolCpu = MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMaxPoolCpu->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + // generate a random Tensor + const std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // Fill input tensor + float *array0 = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) + { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // Run inference + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + const std::size_t outSize = op_cuda->getOutput(0)->size(); + float *computed_cuda = new float[outSize](); + cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + + // forward CPU + op_cpu->forward(); + float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); + + delete[] computed_cuda; + delete[] array0; + cudaFree(array0_d); + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } } \ No newline at end of file diff --git a/unit_tests/Test_PadImpl.cpp b/unit_tests/Test_PadImpl.cpp new file mode 
100644 index 0000000000000000000000000000000000000000..4e799ea6b7d11c9b446e0e4c8b9d12beae24bb05 --- /dev/null +++ b/unit_tests/Test_PadImpl.cpp @@ -0,0 +1,784 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { + SECTION("Symmetric Pad") { + const int pv = 0; // pad value + + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); + auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + }, + { + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW + { + { + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 0, 1, 2, 3, 4, pv}, + { pv, 5, 6, 7, 8, 9, pv}, + { pv, 10, 11, 12, 13, 14, pv}, + { pv, 15, 16, 17, 18, 19, pv}, + { pv, 20, 21, 22, 23, 24, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 25, 26, 27, 28, 29, pv}, + { pv, 30, 31, 32, 33, 34, pv}, + { pv, 35, 36, 37, 38, 39, pv}, + { pv, 40, 41, 42, 43, 44, pv}, + { pv, 45, 46, 47, 48, 49, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 50, 51, 52, 53, 54, pv}, + { pv, 55, 56, 57, 58, 59, pv}, + { pv, 60, 61, 62, 63, 64, pv}, + { pv, 65, 66, 67, 68, 69, pv}, + { pv, 70, 71, 72, 73, 74, pv}, + { pv, pv, pv, pv, pv, pv, pv}} + }, + { + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 75, 76, 77, 78, 79, pv}, + { pv, 80, 81, 82, 83, 84, pv}, + { pv, 85, 86, 87, 88, 89, pv}, + { pv, 90, 91, 92, 93, 94, pv}, + { pv, 95, 96, 97, 98, 99, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + {pv, 100, 101, 102, 103, 104, pv}, + {pv, 105, 106, 107, 108, 109, pv}, + {pv, 110, 111, 112, 113, 114, pv}, + {pv, 115, 116, 117, 118, 119, pv}, + {pv, 120, 121, 122, 123, 124, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + {pv, 125, 126, 127, 128, 129, 
pv}, + {pv, 130, 131, 132, 133, 134, pv}, + {pv, 135, 136, 137, 138, 139, pv}, + {pv, 140, 141, 142, 143, 144, pv}, + {pv, 145, 146, 147, 148, 149, pv}, + { pv, pv, pv, pv, pv, pv, pv}} + } + } + }); + + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Asymmetric Pad") { + const int pv = 0; // pad value + + std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); + auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + }, + { + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,6,6> { //NCHW + { + { + {{ pv, pv, pv, pv, pv, pv}, + { 0, 1, 2, 3, 4, pv}, + { 5, 6, 7, 8, 9, pv}, + { 10, 11, 12, 13, 14, pv}, + { 15, 16, 17, 18, 19, pv}, + { 20, 21, 22, 23, 24, pv}}, + + {{ pv, pv, pv, pv, pv, pv}, + { 25, 26, 27, 28, 29, pv}, + { 30, 31, 32, 33, 34, pv}, + { 35, 36, 37, 38, 39, pv}, + { 40, 41, 42, 43, 44, pv}, + { 45, 46, 47, 48, 49, pv}}, + + {{ pv, pv, pv, pv, pv, pv}, + { 50, 51, 52, 53, 54, pv}, + { 55, 56, 57, 58, 59, pv}, + { 60, 61, 62, 63, 64, pv}, + { 65, 66, 67, 68, 69, pv}, + { 70, 71, 72, 73, 74, pv}} + }, + { + {{ pv, pv, pv, pv, pv, pv}, + { 75, 76, 77, 78, 79, pv}, + { 80, 81, 82, 83, 84, pv}, + { 85, 86, 87, 88, 89, pv}, + { 90, 91, 92, 93, 94, pv}, + { 95, 96, 97, 98, 99, pv}}, + + {{ pv, pv, pv, pv, pv, pv}, + { 100, 101, 102, 103, 104, pv}, + { 105, 106, 107, 108, 109, pv}, + { 110, 111, 112, 113, 114, pv}, + { 115, 116, 117, 118, 119, pv}, + { 120, 121, 122, 123, 124, pv}}, + + {{ pv, pv, pv, pv, pv, pv}, + { 125, 126, 127, 128, 129, pv}, + { 130, 131, 132, 133, 134, pv}, + { 135, 136, 137, 138, 139, pv}, + { 140, 141, 142, 143, 144, pv}, + { 145, 146, 147, 148, 149, pv}} + } + } + }); + + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, 
op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Pad Edge") { + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge); + auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + }, + { + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW + { + { + {{ 0, 0, 1, 2, 3, 4, 4}, + { 0, 0, 1, 2, 3, 4, 4}, + { 5, 5, 6, 7, 8, 9, 9}, + { 10, 10, 11, 12, 13, 14, 14}, + { 15, 15, 16, 17, 18, 19, 19}, + { 20, 20, 21, 22, 23, 24, 24}, + { 20, 20, 21, 22, 23, 24, 24}}, + + {{ 25, 25, 26, 27, 28, 29, 29}, + { 25, 25, 26, 27, 28, 29, 29}, + { 30, 30, 31, 32, 33, 34, 34}, + { 35, 35, 36, 37, 38, 39, 39}, + { 40, 40, 41, 42, 43, 44, 44}, + { 45, 45, 46, 47, 48, 49, 49}, + { 45, 45, 46, 47, 48, 49, 49}}, + + {{ 50, 50, 51, 52, 53, 54, 54}, + { 50, 50, 51, 52, 53, 54, 54}, + { 55, 55, 56, 57, 58, 59, 59}, + { 60, 60, 61, 62, 63, 64, 64}, + { 65, 65, 66, 67, 68, 69, 69}, + { 70, 70, 71, 72, 73, 74, 74}, + { 70, 70, 71, 72, 73, 74, 74}} + }, + { + {{ 75, 75, 76, 77, 78, 79, 79}, + { 75, 75, 76, 77, 78, 79, 79}, + { 80, 80, 81, 82, 83, 84, 84}, + { 85, 85, 86, 87, 88, 89, 89}, + { 90, 90, 91, 92, 93, 94, 94}, + { 95, 95, 96, 97, 98, 99, 99}, + { 95, 95, 96, 97, 98, 99, 99}}, + + {{100, 100, 101, 102, 103, 104, 104}, + {100, 100, 101, 102, 103, 104, 104}, + {105, 105, 106, 107, 108, 109, 109}, + {110, 110, 111, 112, 113, 114, 114}, + {115, 115, 116, 117, 118, 119, 119}, + {120, 120, 121, 122, 123, 124, 124}, + {120, 120, 121, 122, 123, 124, 124}}, + + {{125, 125, 126, 127, 128, 129, 129}, + {125, 125, 126, 127, 128, 129, 129}, + {130, 130, 131, 132, 133, 134, 134}, + {135, 135, 136, 137, 138, 139, 139}, + {140, 140, 141, 142, 143, 144, 144}, + {145, 145, 146, 147, 148, 149, 149}, + {145, 145, 146, 147, 148, 149, 149}} + } + } + }); + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + 
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Pad Reflect") { + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect); + auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + }, + { + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW + { + { + { + { 6, 5, 6, 7, 8, 9, 5}, + { 1, 0, 1, 2, 3, 4, 0}, + { 6, 5, 6, 7, 8, 9, 5}, + { 11, 10, 11, 12, 13, 14, 10}, + { 16, 15, 16, 17, 18, 19, 15}, + { 21, 20, 21, 22, 23, 24, 20}, + { 1, 0, 1, 2, 3, 4, 0} + }, + { + { 31, 30, 31, 32, 33, 34, 30}, + { 26, 25, 26, 27, 28, 29, 25}, + { 31, 30, 31, 32, 33, 34, 30}, + { 36, 35, 36, 37, 38, 39, 35}, + { 41, 40, 41, 42, 43, 44, 40}, + { 46, 45, 46, 47, 48, 49, 45}, + { 26, 25, 26, 27, 28, 29, 25} + }, + { + { 56, 55, 56, 57, 58, 59, 55}, + { 51, 50, 51, 52, 53, 54, 50}, + { 56, 55, 56, 57, 58, 59, 55}, + { 61, 60, 61, 62, 63, 64, 60}, + { 66, 65, 66, 67, 68, 69, 65}, + { 71, 70, 71, 72, 73, 74, 70}, + { 51, 50, 51, 52, 53, 54, 50} + } + }, + { + { + { 81, 80, 81, 82, 83, 84, 80}, + { 76, 75, 76, 77, 78, 79, 75}, + { 81, 80, 81, 82, 83, 84, 80}, + { 86, 85, 86, 87, 88, 89, 85}, + { 91, 90, 91, 92, 93, 94, 90}, + { 96, 95, 96, 97, 98, 99, 95}, + { 76, 75, 76, 77, 78, 79, 75} + }, + { + { 106, 105, 106, 107, 108, 109, 105}, + { 101, 100, 101, 102, 103, 104, 100}, + { 106, 105, 106, 107, 108, 109, 105}, + { 111, 110, 111, 112, 113, 114, 110}, + { 116, 115, 116, 117, 118, 119, 115}, + { 121, 120, 121, 122, 123, 124, 120}, + { 101, 100, 101, 102, 103, 104, 100} + }, + { + { 131, 130, 131, 132, 133, 134, 130}, + { 126, 125, 126, 127, 128, 129, 125}, + { 131, 130, 131, 132, 133, 134, 130}, + { 136, 135, 136, 137, 138, 139, 135}, + { 141, 140, 141, 142, 143, 144, 140}, + { 146, 145, 146, 147, 148, 149, 145}, + { 126, 125, 126, 127, 128, 129, 125} + } + } + } + }); + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + + SECTION("Pad Wrap") { + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", 
PadBorderType::Wrap); + auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + }, + { + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW + { + { + {{ 24, 20, 21, 22, 23, 24, 20}, + { 4, 0, 1, 2, 3, 4, 0}, + { 9, 5, 6, 7, 8, 9, 5}, + { 14, 10, 11, 12, 13, 14, 10}, + { 19, 15, 16, 17, 18, 19, 15}, + { 24, 20, 21, 22, 23, 24, 20}, + { 4, 0, 1, 2, 3, 4, 0}}, + + {{ 49, 45, 46, 47, 48, 49, 45}, + { 29, 25, 26, 27, 28, 29, 25}, + { 34, 30, 31, 32, 33, 34, 30}, + { 39, 35, 36, 37, 38, 39, 35}, + { 44, 40, 41, 42, 43, 44, 40}, + { 49, 45, 46, 47, 48, 49, 45}, + { 29, 25, 26, 27, 28, 29, 25}}, + + {{ 74, 70, 71, 72, 73, 74, 70}, + { 54, 50, 51, 52, 53, 54, 50}, + { 59, 55, 56, 57, 58, 59, 55}, + { 64, 60, 61, 62, 63, 64, 60}, + { 69, 65, 66, 67, 68, 69, 65}, + { 74, 70, 71, 72, 73, 74, 70}, + { 54, 50, 51, 52, 53, 54, 50}} + }, + { + {{ 99, 95, 96, 97, 98, 99, 95}, + { 79, 75, 76, 77, 78, 79, 75}, + { 84, 80, 81, 82, 83, 84, 80}, + { 89, 85, 86, 87, 88, 89, 85}, + { 94, 90, 91, 92, 93, 94, 90}, + { 99, 95, 96, 97, 98, 99, 95}, + { 79, 75, 76, 77, 78, 79, 75}}, + + {{124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}, + {109, 105, 106, 107, 108, 109, 105}, + {114, 110, 111, 112, 113, 114, 110}, + {119, 115, 116, 117, 118, 119, 115}, + {124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}}, + + {{149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}, + {134, 130, 131, 132, 133, 134, 130}, + {139, 135, 136, 137, 138, 139, 135}, + {144, 140, 141, 142, 143, 144, 140}, + {149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}} + } + } + }); + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("Random Input") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> 
padTypeDist(std::size_t(0), std::size_t(1));
+        // TODO: fix Reflect and Wrap Pad; CPU and GPU only give the same results when padding = 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), std::size_t(5));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+            const std::size_t borderType = padTypeDist(gen);
+            const std::size_t padding = padSizeDist(gen);
+            // Create Pad Operator CUDA
+            std::shared_ptr<Node> myPadCUDA = Pad<2>({padding, padding, padding, padding}, "mypadcuda", static_cast<PadBorderType>(borderType));
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPadCUDA -> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create Pad Operator CPU
+            std::shared_ptr<Node> myPadCPU = Pad<2>({padding, padding, padding, padding}, "mypadcpu", static_cast<PadBorderType>(borderType));
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPadCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            float* array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+
+            // input CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // forward CUDA
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize = op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu));
+
+            delete[] array0;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+
+TEST_CASE("[gpu/operator] Pad(backward)", "[Pad][GPU]") {
+    SECTION("Symmetric Pad") {
+        const int pv = 0; // pad value
+
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        auto op =
std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}} + } + } + }); + myInput->setBackend("cuda"); + myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->setDataType(DataType::Float32); + myPad->getOperator()->setBackend("cuda"); + + myPad->forward(); + + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,3,7,7> { //NCHW + { + { + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 0, 1, 2, 3, 4, pv}, + { pv, 5, 6, 7, 8, 9, pv}, + { pv, 10, 11, 12, 13, 14, pv}, + { pv, 15, 16, 17, 18, 19, pv}, + { pv, 20, 21, 22, 23, 24, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 25, 26, 27, 28, 29, pv}, + { pv, 30, 31, 32, 33, 34, pv}, + { pv, 35, 36, 37, 38, 39, pv}, + { pv, 40, 41, 42, 43, 44, pv}, + { pv, 45, 46, 47, 48, 49, pv}, + { pv, pv, pv, pv, pv, pv, pv}}, + + {{ pv, pv, pv, pv, pv, pv, pv}, + { pv, 50, 51, 52, 53, 54, pv}, + { pv, 55, 56, 57, 58, 59, pv}, + { pv, 60, 61, 62, 63, 64, pv}, + { pv, 65, 66, 67, 68, 69, pv}, + { pv, 70, 71, 72, 73, 74, pv}, + { pv, pv, pv, pv, pv, pv, pv}} + } + } + }); + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myPad->backward()); + + float *computedGradCuda = new float[myInput->size()](); + cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); + + myInput->setBackend("cpu"); + for(int i = 0; i < myInput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + } + + delete[] computedGradCuda; + } +} \ No newline at end of file diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp index 5651496561f3e1d864767ec38addd4d704b8693c..7ab38aa7def7f846555ae33ccd3871d6ee5a1539 100644 --- a/unit_tests/Test_ReLUImpl.cpp +++ b/unit_tests/Test_ReLUImpl.cpp @@ -1,5 +1,5 @@ /******************************************************************************** - * Copyright (c) 2023 CEA-List + * Copyright (c) 2024 CEA-List * * This program and the accompanying materials are made available under the * terms of the Eclipse Public License 2.0 which is available at @@ -10,123 +10,20 @@ ********************************************************************************/ #include <array> - #include <catch2/catch_test_macros.hpp> - -#include "Test_cuda.hpp" - -#include "aidge/data/Tensor.hpp" +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { - SECTION("1D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> { - {0, 1, 2,-3, 4,-5,-6, 7, 8, 9} - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,10> { - {0, 1, 2, 0, 4, 0, 0, 7, 8, 9} 
- }); - - std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); - op->associateInput(0,input0); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myReLU->forward(); - - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; - } - - SECTION("2D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array2D<float,2,10> { - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,10> { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - }); - - std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); - op->associateInput(0,input0); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myReLU->forward(); - - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; - } - - SECTION("3D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<float,2,2,10> { - { - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - }, - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,2,2,10> { - { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - }, - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - } - }); - - std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); - op->associateInput(0,input0); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myReLU->forward(); - - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; - } - - SECTION("4D Tensor") { + SECTION("Constant Input") { std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { { { @@ -193,4 +90,80 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { delete[] computedOutput; } + SECTION("Random Input") + { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), 
std::size_t(8)); // Max nbDims supported by cudnn is 8 + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create ReLU Operator + std::shared_ptr<Node> myReLU = ReLU("myReLU"); + auto op = std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + + // generate a random Tensor + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // Create the input Tensor + std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); + T0->setDataType(DataType::Float32); + T0->setBackend("cuda"); + T0->resize(dims); + op->associateInput(0, T0); + + // Fill input tensor + float *input_h = new float[nb_elements]; + float *output_h = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) + { + float value = valueDist(gen); + input_h[i] = value; + output_h[i] = value>=0?value:0.0f; + } + float *input_d; + cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements); + cudaMemcpy(input_d, input_h, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0->getImpl()->setRawPtr(input_d, nb_elements); + + // Run inference + start = std::chrono::system_clock::now(); + myReLU->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + float *computedOutput = new float[nb_elements](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + + REQUIRE(approxEq<float>(*computedOutput, *output_h)); + + delete[] computedOutput; + delete[] input_h; + cudaFree(input_d); + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + + } } diff --git a/unit_tests/Test_ReshapeImpl.cpp b/unit_tests/Test_ReshapeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df9a4dda6d59371c8dd07f8c4442e3a3bb4a7159 --- /dev/null +++ b/unit_tests/Test_ReshapeImpl.cpp @@ -0,0 +1,274 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate, std::shuffle, std::transform +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + + +TEST_CASE("[gpu/operator] Reshape(forward)") { + SECTION("1D Tensor") { + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<float,6> { + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0} + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,3> { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + } + }); + + std::shared_ptr<Node> myReshape = Reshape({2, 3}); + auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + op->associateInput(0, input); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReshape->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("2D Tensor") { + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + } + + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { + { + {1.0, 2.0}, + {3.0, 4.0}, + {5.0, 6.0} + } + }); + + std::shared_ptr<Node> myReshape = Reshape({3, 2}); + auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + op->associateInput(0, input); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myReshape->forward(); + + float* computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput; + } + SECTION("Random Input") + { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8 + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // generate a random Tensor + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims, shuffeledDims; + for (std::size_t i = 0; i < nbDims; ++i) + { + dims.push_back(dimSizeDist(gen)); + } + shuffeledDims = dims; + 
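+                // The target shape is a random permutation of the input dims, so the element count
+                // is preserved and the Reshape is always valid.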
+            std::shuffle(shuffeledDims.begin(), shuffeledDims.end(), gen);
+
+            std::vector<std::int64_t> shuffeledIntDims(shuffeledDims.size());
+            std::transform(shuffeledDims.begin(), shuffeledDims.end(), shuffeledIntDims.begin(),
+                           [](std::size_t value) { return static_cast<std::int64_t>(value); });
+            // Create Reshape Operator CUDA
+            std::shared_ptr<Node> myReshapeCuda = Reshape(shuffeledIntDims, false, "myreshapecuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myReshapeCuda->getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create Reshape Operator CPU
+            std::shared_ptr<Node> myReshapeCpu = Reshape(shuffeledIntDims, false, "myreshapecpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myReshapeCpu->getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Fill input tensor
+            float *array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0, T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu->getImpl()->setRawPtr(array0, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            float *computed_cuda = new float[nb_elements];
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu));
+
+            delete[] computed_cuda;
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+
+    }
+}
+
+TEST_CASE("[gpu/operator] Reshape(backward)") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({6});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array1D<float, 6> {
+            {1, 2, 3, 4, 5, 6}
+        });
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        predictedOutput->setGrad(myOutputGrad);
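+        // Reshape's backward is expected to pass the gradient through unchanged,
+        // only viewed with the input dimensions, so the expected input gradient
+        // holds the same values as the output gradient, reshaped to 2x3.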
+        REQUIRE_NOTHROW(myReshape->backward());
+
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+
+        for (std::size_t i = 0; i < expectedInputGrad->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+    SECTION("2D Tensor") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({3, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> {
+            {
+                {1.0, 2.0},
+                {3.0, 4.0},
+                {5.0, 6.0}
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        predictedOutput->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myReshape->backward());
+
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+
+        for (std::size_t i = 0; i < expectedInputGrad->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
diff --git a/version.txt b/version.txt
index 341cf11faf9a29504168de4e54beaad182c5adc5..f4778493c50025c6ab147a1fec7486ef0c706792 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.0
\ No newline at end of file
+0.2.2
\ No newline at end of file