diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index 8a17d69ce11c8ba485d365686280a7a224f062c6..974dbb0248a77024b4c8bc7c7c467646512f0828 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -13,15 +13,18 @@ #define AIDGE_BACKEND_CUDA_IMPORTS_H_ #include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/OperatorImpl.hpp" #include "aidge/backend/cuda/operator/AddImpl.hpp" #include "aidge/backend/cuda/operator/AndImpl.hpp" #include "aidge/backend/cuda/operator/ArgMaxImpl.hpp" #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" #include "aidge/backend/cuda/operator/BatchNormImpl.hpp" #include "aidge/backend/cuda/operator/ConvImpl.hpp" +#include "aidge/backend/cuda/operator/ClipImpl.hpp" #include "aidge/backend/cuda/operator/DivImpl.hpp" #include "aidge/backend/cuda/operator/FCImpl.hpp" #include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cuda/operator/LRNImpl.hpp" #include "aidge/backend/cuda/operator/LnImpl.hpp" #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" #include "aidge/backend/cuda/operator/MulImpl.hpp" @@ -30,6 +33,7 @@ #include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/operator/ReLUImpl.hpp" +#include "aidge/backend/cuda/operator/RoundImpl.hpp" #include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ReshapeImpl.hpp" diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 541afeecc751332d41ff082b790282abcad5a1b0..5a2873cfbfa9aafc97b543f2ccce3d1abf0b6057 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -94,7 +94,7 @@ public: } void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts); const T* srcT = static_cast<const T *>(src); T* dstT = static_cast<T *>(rawPtr(offset)); @@ -107,7 +107,7 @@ public: return; } - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts); switch (srcDt) { case DataType::Float64: thrust_copy(static_cast<const double*>(src), @@ -171,17 +171,17 @@ public: } void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); } void copyFromHost(const void 
*src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice)); } void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(offset + length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy offset ({}) + length ({}) is above capacity ({})", typeid(T).name(), offset, length, mNbElts); CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost)); } diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp index 429d6f1b04489d9e38ce96d584a1ce9528dd0b2d..42d420f8410f79100fdfdbe3eabb8b43e616a74a 100644 --- a/include/aidge/backend/cuda/operator/AddImpl.hpp +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<AddImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/AndImpl.hpp b/include/aidge/backend/cuda/operator/AndImpl.hpp index 4105ec87db2c58e218c629a1c94f31efd37c80ee..e90a4c5fe3d7b4cd529dcb4cb5400a6447f53e3c 100644 --- a/include/aidge/backend/cuda/operator/AndImpl.hpp +++ b/include/aidge/backend/cuda/operator/AndImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<AndImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp index a89aebf96914f258f6be616b940ec195ec9ae2a9..7b4628084a913a10e48302597a4d5b77fb7f6d16 100644 --- a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<ArgMaxImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp index 7f8fb4075affd3e5f17533ea67b051dbb6395f04..1c4efcf66850330fe9747c500093efa4456fa3f1 100644 --- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<AvgPoolingImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp index 
5ba8656ef8a25ffa53584641a938f637ecff9b94..025ef406fa6a988e758707b11fb2ceab6c829f26 100644
--- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
+++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
@@ -37,7 +37,7 @@ public:
         return std::make_unique<BatchNormImpl_cuda>(op);
     }
 
-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
         return {
             {DataType::Float64},
             {DataType::Float32},
diff --git a/include/aidge/backend/cuda/operator/ClipImpl.hpp b/include/aidge/backend/cuda/operator/ClipImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee641526cc4217f4ed309db43071345b46397282
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ClipImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Clip.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class ClipImpl_cuda : public OperatorImpl {
+public:
+    ClipImpl_cuda(const Clip_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ClipImpl_cuda> create(const Clip_Op& op) {
+        return std::make_unique<ClipImpl_cuda>(op);
+    }
+
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    template <class T> void forward_();
+    template <class T> void backward_(const Tensor& outGrad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Clip_Op, "cuda", Aidge::ClipImpl_cuda::create);
+} // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_CLIPIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ClipImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ClipImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..96bc460a04e3e149c5d57b646ee8baa2f3bf6e85
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ClipImpl_CUDA_kernels.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_CLIPIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_CLIPIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void clipForward(const T* input, T* output,int size,T min_val, T max_val); + + +} +#endif /* AIDGE_CUDA_OPERATOR_CLIPIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index ce94ec6695735c93d5c8d0acfdc6153e91e7147d..27f3781a6824dd71d228b90c71df58b12ea0a6b3 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -43,7 +43,7 @@ public: return std::make_unique<ConvImpl_cuda<DIM>>(op, true); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/include/aidge/backend/cuda/operator/DivImpl.hpp b/include/aidge/backend/cuda/operator/DivImpl.hpp index 4b15445cb791aa1cf2520018d1015e19aaf10ce3..fbd3c73f1741d05549f06290ba9166b8d11c604d 100644 --- a/include/aidge/backend/cuda/operator/DivImpl.hpp +++ b/include/aidge/backend/cuda/operator/DivImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<DivImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp index f2dd0c90c0096a1b57fb6860e5991d0c1e824be9..8380754ea2419b2baff6de5126f8b6ff3e640178 100644 --- a/include/aidge/backend/cuda/operator/FCImpl.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<FCImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp index 3f0386dcfa68d4b55bebeb524dfedfd5edeb0fe9..5b0cf07ab8687b9746d13af2274465ad923e6571 100644 --- a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<GlobalAveragePoolingImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp index 742401de7903f19ab4d8f51a153b0e864f21dd47..0d858c4719899094f996ca4f82f075df547a6fd4 100644 --- a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<ILayerNormImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> 
getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/LRNImpl.hpp b/include/aidge/backend/cuda/operator/LRNImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6eb412435e735dd2ec765911d74e521fe9612a74 --- /dev/null +++ b/include/aidge/backend/cuda/operator/LRNImpl.hpp @@ -0,0 +1,63 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_LRNIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_LRNIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/LRN.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class LRNImpl_cuda : public OperatorImpl { +public: + LRNImpl_cuda(const LRN_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<LRNImpl_cuda> create(const LRN_Op& op) { + return std::make_unique<LRNImpl_cuda>(op); + } + + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Any} + }; + } + + void forward() override; + void backward() override; + ~LRNImpl_cuda(); + +private: + // CuDNN specific variables + cudnnLRNDescriptor_t mLRNDesc = nullptr; + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(LRN_Op, "cuda", Aidge::LRNImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_LRNIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/LnImpl.hpp b/include/aidge/backend/cuda/operator/LnImpl.hpp index 1617754fbf5dd52e099a9787a25a827851933af9..fbbccc11275b5c11bbaa86d05a2c19a1a46c11c1 100644 --- a/include/aidge/backend/cuda/operator/LnImpl.hpp +++ b/include/aidge/backend/cuda/operator/LnImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<LnImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp index a203e761beaeccec96b36bbd5a424a193cdb6387..474a408f9697e8e91ffe9c8e2a79a79d7968e80a 100644 --- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<MaxPoolingImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp index 
37d3d5a0df7b63dc63ad13737d8a8b463bf315c8..9a1a4d79d32c7a962d2086319d948e60a9f51049 100644 --- a/include/aidge/backend/cuda/operator/MulImpl.hpp +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<MulImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/OperatorImpl.hpp b/include/aidge/backend/cuda/operator/OperatorImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4cbd29a617e6f89c73939f214e727b2f96e78149 --- /dev/null +++ b/include/aidge/backend/cuda/operator/OperatorImpl.hpp @@ -0,0 +1,53 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_IMPL_H_ +#define AIDGE_CUDA_OPERATOR_IMPL_H_ + +#include <cstddef> // std::size_t +#include <memory> +#include <tuple> // std::tuple +#include <vector> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class Op, class FwdFunc, class BwdFunc = void()> +class OperatorImpl_cuda : public OperatorImpl, + public Registrable<OperatorImpl_cuda<Op, FwdFunc, BwdFunc>, ImplSpec, Impl<FwdFunc, BwdFunc>> +{ +public: + OperatorImpl_cuda(const Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<OperatorImpl_cuda<Op, FwdFunc, BwdFunc>> create(const Op& op) { + return std::make_unique<OperatorImpl_cuda<Op, FwdFunc, BwdFunc>>(op); + } + + virtual std::shared_ptr<ProdConso> getProdConso() const override { + const auto impl = Registrar<OperatorImpl_cuda>::create(getBestMatch(getRequiredSpec())); + return impl.prodConso(mOp); + } + + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + // return Registrar<OperatorImpl_cuda>::getKeys(); // Note: cannot return set due to python binding + std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cuda>::getKeys(); + return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end()); + } + + void forward() override; + void backward() override; +}; +} // namespace Aidge + +#endif /* AIDGE_CUDA_OPERATOR_IMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp b/include/aidge/backend/cuda/operator/PadImpl.hpp index d51361d6ee5a3ec9a858d290b3f5fe5251b6fa97..a0f7037c811cd3cb130cffed0bb7746e33220074 100644 --- a/include/aidge/backend/cuda/operator/PadImpl.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<PadImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp index 11ddb0ea8b0e6603bf009c4ae0a7fa3247a8904f..b52d9883fa0acd320396bb358f253dcf62fea638 100644 --- a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp +++ 
b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp @@ -32,6 +32,8 @@ namespace Aidge unsigned int padType, T padValue, const T *input, - T *outputs); + T *outputs, + const T alpha, + const T beta); } #endif /* AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/PowImpl.hpp b/include/aidge/backend/cuda/operator/PowImpl.hpp index 403648d9a294ee598f117c8b05e6f0875e998307..9b53d8dc04985794238f79cff9c78c44408fb6d7 100644 --- a/include/aidge/backend/cuda/operator/PowImpl.hpp +++ b/include/aidge/backend/cuda/operator/PowImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<PowImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp index 344923ba1ee08642a3e3e5f685bfd2c7de8a74b4..306a56c4d0959dc4d818a6791173c375f5435360 100644 --- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<ReLUImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp index 9d352b8b1d14aeaa4230accd7aa81c279c18b7a8..bd9d4804330344e10cda9beffa595881d996ce9d 100644 --- a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp @@ -25,6 +25,8 @@ namespace Aidge const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, - int outSize); + int outSize, + const T alpha, + const T beta); } #endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp index a50ff21b35f0b062c6a9c327ea2892c15055a175..1f6878480d69e19f8c73a12862cc12b2d675440d 100644 --- a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<ReduceMeanImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp index a5a7ae48d7e5bd8f370964d7f81795ecbaa5986b..10af90ba3a4ffc1d1464dd73f15313315b0c0032 100644 --- a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<ReduceSumImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp index d412590c63f925806973038d67ee18e0847f79c2..2c8ebd68cff0313031279f83109043eb17d919b5 100644 --- 
a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
@@ -36,7 +36,7 @@ public:
         return std::make_unique<ReshapeImpl_cuda>(op);
     }
 
-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
         return {
             {DataType::Float64},
             {DataType::Float32},
diff --git a/include/aidge/backend/cuda/operator/RoundImpl.hpp b/include/aidge/backend/cuda/operator/RoundImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b9b9b3dc34aa6c63baf1e9c510ec6b48ddd6586
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/RoundImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ROUNDIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ROUNDIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Round.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+class RoundImpl_cuda : public OperatorImpl {
+public:
+    RoundImpl_cuda(const Round_Op& op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<RoundImpl_cuda> create(const Round_Op& op) {
+        return std::make_unique<RoundImpl_cuda>(op);
+    }
+
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        return {
+            {DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float16},
+        };
+    }
+
+    void forward() override;
+    void backward() override;
+
+private:
+    template <class T> void forward_();
+    template <class T> void backward_(const Tensor& outGrad);
+};
+
+// Implementation entry point registration to Operator
+REGISTRAR(Round_Op, "cuda", Aidge::RoundImpl_cuda::create);
+} // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ROUNDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/RoundImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/RoundImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3df921cbf26c2365fae74c883d2f5f1acde2eb84
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/RoundImpl_CUDA_kernels.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ROUNDIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_ROUNDIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void roundForward(const T* input, T* output,int size); + +} +#endif /* AIDGE_CUDA_OPERATOR_ROUNDIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp index f83b41ae139482cdb0cd1060846c77ba78fcc0ee..1eff6dfbb1777d8dbd823d7bc9b94894bb2646b9 100644 --- a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<ShiftGELUImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp index 707b5616fde120f7e8ef38e6dc9f1552cfdb0d59..3e6e3744cb544d0928a9229aa5110cf776f0c507 100644 --- a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp @@ -37,7 +37,7 @@ public: return std::make_unique<ShiftMaxImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp index bc29b9e5f53716641a692cd63c29f4600f3cdd02..dc1434c8ecc8568bd4f82c7c7ce5db78cc1885a9 100644 --- a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp +++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<SigmoidImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/include/aidge/backend/cuda/operator/SqrtImpl.hpp b/include/aidge/backend/cuda/operator/SqrtImpl.hpp index dfa2bd0f67e5556838c923bf8a2857f0c613503b..2828abcbee1c94d84a499082ad28c18342b87980 100644 --- a/include/aidge/backend/cuda/operator/SqrtImpl.hpp +++ b/include/aidge/backend/cuda/operator/SqrtImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<SqrtImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/SqrtImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/SqrtImpl_CUDA_kernels.hpp index 26f609cab8137a16fb5cf682561f237aaab74530..a1ac156424e657be66423ecae2d260bc962ef894 100644 --- a/include/aidge/backend/cuda/operator/SqrtImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/SqrtImpl_CUDA_kernels.hpp @@ -25,10 +25,19 @@ namespace Aidge { template <class T> -void sqrtForward(const T* input, T* output, int size); +void sqrtForward(const T* input, + T* output, + int size, 
+ const T alpha, + const T beta); template <class T> -void sqrtBackward(const T* input, const T* outputGrad, T* inputGrad, int size); +void sqrtBackward(const T* input, + const T* outputGrad, + T* inputGrad, + int size, + const T alpha, + const T beta); } #endif /* AIDGE_CUDA_OPERATOR_SQRTIMPL_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp index 45c833f3e7f9f25258469a4d1e34e8598df068ef..529d0b2b2dd4a0ec8a3dae5bf0219f8a4f2968c6 100644 --- a/include/aidge/backend/cuda/operator/SubImpl.hpp +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<SubImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Float64}, {DataType::Float32}, diff --git a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp index 166acd6adee397a3f284363a9db1e71152467b94..a87d7bd8c318149cb625a3cf0122f7eac1ea6149 100644 --- a/include/aidge/backend/cuda/operator/TanhImpl.hpp +++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp @@ -36,7 +36,7 @@ public: return std::make_unique<TanhImpl_cuda>(op); } - virtual std::set<ImplSpec> getAvailableImplSpecs() const override { + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { return { {DataType::Any} }; diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index de7ea925554906ea5fe1e5dcba268b17a06a47bd..8771a79e938dff893d5295bd847567a0dcb18f32 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -155,10 +155,12 @@ void Aidge::AddImpl_cuda::backward() { } template <class T> -void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate for (std::size_t i = 0; i < inputsDims.size(); i++) { diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp index d1270ee4b0a556e1053f3cfde8d71ec5efbee279..854171017899c7ea52a20f59e9181e8008a4d3ad 100644 --- a/src/operator/AvgPoolingImpl.cpp +++ b/src/operator/AvgPoolingImpl.cpp @@ -97,11 +97,13 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::backward() { template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { +void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const T alpha = 1.0f; - const T beta = 0.0f; + const T beta = 1.0f; // accumulate + CHECK_CUDNN_STATUS( cudnnPoolingBackward(CudaContext::cudnnHandle(), mAvgPoolingDesc, diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp index 5cf079326a0ea003fb72875bcaebefe847086ecb..f72e0abee0e925aaa213265670cc77f3ca1e13b3 100644 --- a/src/operator/BatchNormImpl.cpp +++ b/src/operator/BatchNormImpl.cpp @@ -86,7 +86,9 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { template <Aidge::DimIdx_t DIM> template <class T> void 
Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + const BatchNorm_Op<DIM>& op = static_cast<const BatchNorm_Op<DIM>&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; @@ -102,8 +104,11 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor else { tensorDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl())->getCudnnTensorDesc(input1); } - CHECK_CUDNN_STATUS( - cudnnBatchNormalizationForwardInference( + + if (op.trainingMode()) + { + CHECK_CUDNN_STATUS( + cudnnBatchNormalizationForwardTraining( CudaContext::cudnnHandle(), mMode, &alpha, @@ -114,11 +119,36 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), tensorDesc, input1.getImpl()->rawPtr(), - input2.getImpl()->rawPtr(), + input2.getImpl()->rawPtr(), + op.momentum(), input3.getImpl()->rawPtr(), - input4.getImpl()->rawPtr(), - mEpsilon) - ); + input4.getImpl()->rawPtr(), + mEpsilon, + nullptr, + nullptr) // TODO add savedMean and savedVar? + ); + } + else + { + CHECK_CUDNN_STATUS( + cudnnBatchNormalizationForwardInference( + CudaContext::cudnnHandle(), + mMode, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + tensorDesc, + input1.getImpl()->rawPtr(), + input2.getImpl()->rawPtr(), + input3.getImpl()->rawPtr(), + input4.getImpl()->rawPtr(), + mEpsilon) + ); + } + if (input1.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); @@ -163,9 +193,9 @@ template <class T> void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& outputGrad, const Tensor& weights) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate const typename Cuda::cudnn_scaling_type<T>::type alphaData = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type betaData = 1.0f; // accumulate cudnnTensorDescriptor_t scaleBiasDesc; // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels diff --git a/src/operator/ClipImpl.cpp b/src/operator/ClipImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb2de7efd81b914da97aa34330694cb62f1c8ba8 --- /dev/null +++ b/src/operator/ClipImpl.cpp @@ -0,0 +1,69 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include <cuda_fp16.h>
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ClipImpl.hpp"
+#include "aidge/backend/cuda/operator/ClipImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Div.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ClipImpl_cuda::forward() {
+    const Clip_Op& op = static_cast<const Clip_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Clip operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Clip forward because the 0-th input has no implementation.");
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>();
+            break;
+        case DataType::Float32:
+            forward_<float>();
+            break;
+        case DataType::Float16:
+            forward_<half>();
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::ClipImpl_cuda::forward_()
+{
+    const Clip_Op& op = static_cast<const Clip_Op&>(mOp);
+    float min = op.min();
+    float max = op.max();
+    int size = op.getInput(0)->size();
+    T* inputPtr = static_cast<T*>(op.getInput(0)->getImpl()->rawPtr());
+    T* outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+    Aidge::clipForward<T>(inputPtr,outputPtr,size,min,max);
+}
+
+void Aidge::ClipImpl_cuda::backward() {
+    // TODO
+}
+
+template <class T>
+void Aidge::ClipImpl_cuda::backward_(const Tensor& outGrad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    // TODO
+}
\ No newline at end of file
diff --git a/src/operator/ClipImpl_CUDA_kernels.cu b/src/operator/ClipImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c625debcd3d837d63a0d6d7a1da32a970a6b4314
--- /dev/null
+++ b/src/operator/ClipImpl_CUDA_kernels.cu
@@ -0,0 +1,56 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/ClipImpl_CUDA_kernels.hpp"
+
+
+// Helper function for Clip
+template <typename T>
+__device__ T clip(T a, T min_val, T max_val) {
+    return min(max(a, min_val), max_val);
+}
+
+template <>
+__device__ half clip<half>(half a, half min_val, half max_val) {
+#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 8000
+    return __hmax(min_val, __hmin(a, max_val));
+#else
+    return __float2half(fmaxf(__half2float(min_val), fminf(__half2float(a), __half2float(max_val))));
+#endif
+}
+
+
+template <class T>
+__global__ void clipKernel(const T* input, T* output, int size, T min_val, T max_val) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+    output[idx] = clip(input[idx], min_val, max_val);
+}
+
+template <class T>
+
+void Aidge::clipForward(const T* input, T* output,int size,T min_val, T max_val)
+{
+    int blockSize = 256;
+    int numBlocks = (size + blockSize - 1) / blockSize;
+
+    clipKernel<<<numBlocks, blockSize>>>(input, output, size, min_val, max_val);
+
+    CHECK_CUDA_STATUS(cudaGetLastError());
+    CHECK_CUDA_STATUS(cudaDeviceSynchronize());
+};
+
+template void Aidge::clipForward<double>(const double* input, double* output, int size, double min_val, double max_val);
+
+template void Aidge::clipForward<float>(const float* input, float* output, int size, float min_val, float max_val);
+
+template void Aidge::clipForward<half>(const half* input, half* output, int size, half min_val, half max_val);
+
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 24e01db03692ffaa884b31a224a1947a9e1645a0..076dccab3e52cc458b7b95788890e7fb600e4e49 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -265,7 +265,7 @@ void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& in
     const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad()));
 
     const T alpha = 1.0f;
-    const T beta = 0.0f;
+    const T beta = 1.0f; // accumulate
 
     CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter(
         CudaContext::cudnnHandle(),
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
index 0326a60c1a3aabf43ca3a1d892328991d6d72366..8f5fdc717dd2337a1324a0c1be4887133bb70492 100644
--- a/src/operator/DivImpl.cpp
+++ b/src/operator/DivImpl.cpp
@@ -108,6 +108,6 @@ template <class T>
 void Aidge::DivImpl_cuda::backward_(const Tensor& outGrad) {
     const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
     const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
-    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate
     // TODO
 }
\ No newline at end of file
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 1a7bb8edb51312d08467354e20723ad19176bfee..55cb31b7492956a2c722e775c225276d22fbdf4e 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -116,6 +116,7 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
 }
 
 void Aidge::FCImpl_cuda::backward() {
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
     AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1");
     AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2");
 
@@ -146,9 +147,11 @@ template<class T>
 void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels)
{ const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate + const typename Cuda::cudnn_scaling_type<T>::type betaData = 1.0f; // accumulate + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); @@ -175,7 +178,8 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c weightsGrad, m)); - if(!input2.empty()){ + if (!input2.empty()) + { T * biasGrad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); T* onesVector; CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); @@ -200,6 +204,7 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c 1)); CHECK_CUDA_STATUS(cudaFree(onesVector)); } + // Performing inputGrad = (weights) * (outputGrad) CHECK_CUBLAS_STATUS(cublasGemm( CudaContext::cublasHandle(), diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp index 8c83d477094d9cce41807d888cca57bd614e9cc6..c409c84a4eef466e43fa8dd2e2f138bb55158a0d 100644 --- a/src/operator/GlobalAveragePoolingImpl.cpp +++ b/src/operator/GlobalAveragePoolingImpl.cpp @@ -88,7 +88,7 @@ void Aidge::GlobalAveragePoolingImpl_cuda::backward_(const Tensor& output_grad) const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const T alpha = 1.0f; - const T beta = 0.0f; + const T beta = 1.0f; // accumulate CHECK_CUDNN_STATUS( cudnnPoolingBackward(CudaContext::cudnnHandle(), mGlobalAveragePoolingDesc, diff --git a/src/operator/LRNImpl.cpp b/src/operator/LRNImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e350a0c83ed334e9de57f6db90759942e5a90573 --- /dev/null +++ b/src/operator/LRNImpl.cpp @@ -0,0 +1,112 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/LRNImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/LRN.hpp" +#include "aidge/utils/Types.h" + +void Aidge::LRNImpl_cuda::forward() { + const LRN_Op& op = static_cast<const LRN_Op&>(mOp); + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + // Lazy-initialize CuDNN LRN descriptor + if (mLRNDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateLRNDescriptor(&mLRNDesc)); + CHECK_CUDNN_STATUS(cudnnSetLRNDescriptor(mLRNDesc, op.size(), op.alpha(), op.beta(), op.bias())); + } + + // Do the actual forward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. 
+ if (op.getOutput(0)->dataType() == DataType::Float64) { + forward_<double>(input); + } + else { + forward_<float>(input); + } +} + +template <class T> +void Aidge::LRNImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + CHECK_CUDNN_STATUS(cudnnLRNCrossChannelForward(CudaContext::cudnnHandle(), + mLRNDesc, + CUDNN_LRN_CROSS_CHANNEL_DIM1, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); +} + +void Aidge::LRNImpl_cuda::backward() { + const LRN_Op& op = static_cast<const LRN_Op&>(mOp); + assert(mOp.getRawInput(0) && "missing input #0"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + // Lazy-initialize CuDNN LRN descriptor + if (mLRNDesc == nullptr) { + CHECK_CUDNN_STATUS(cudnnCreateLRNDescriptor(&mLRNDesc)); + CHECK_CUDNN_STATUS(cudnnSetLRNDescriptor(mLRNDesc, op.size(), op.alpha(), op.beta(), op.bias())); + } + + // Do the actual backward computation + // Template is only for scaling parameters, which are always in float + // excepted when the convolution is performed in double precision. + if (op.getInput(0)->grad()->dataType() == DataType::Float64) { + backward_<double>(output_grad); + } + else { + backward_<float>(output_grad); + } +} + +template <class T> +void Aidge::LRNImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; + + CHECK_CUDNN_STATUS(cudnnLRNCrossChannelBackward( + CudaContext::cudnnHandle(), + mLRNDesc, + CUDNN_LRN_CROSS_CHANNEL_DIM1, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); +} + +Aidge::LRNImpl_cuda::~LRNImpl_cuda() { + if (mLRNDesc != nullptr) { + cudnnDestroyLRNDescriptor(mLRNDesc); + } +} + diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp index ed09ed45f5006c3760376a9d6f44f29d05bcfabe..7f0ac34d262f2c903e08dd93194cf9901da6282a 100644 --- a/src/operator/LnImpl.cpp +++ b/src/operator/LnImpl.cpp @@ -47,8 +47,7 @@ void Aidge::LnImpl_cuda::forward_(const Tensor& input) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); - - + Aidge::lnForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); } diff 
--git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index 39050635102ebebaed8192cb4bb338e2bc31d5e8..19aacb5076e6ca32241eac6efa8b83bbadbcd456 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -102,7 +102,7 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); const T alpha = 1.0f; - const T beta = 0.0f; + const T beta = 1.0f; // accumulate CHECK_CUDNN_STATUS( cudnnPoolingBackward(CudaContext::cudnnHandle(), mMaxPoolingDesc, diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index af87251e8f29eded7d24cca2f08b880557ebb482..aa9b4c74785d3d5785f9d9d62d1a72503f8be104 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -172,10 +172,10 @@ void Aidge::MulImpl_cuda::backward() { template <class T> void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp index 3606ba66d002f1467aa65771015cab02c066d5a5..0b17332d84c9b7eccf864ab99c3f1bb453640aa4 100644 --- a/src/operator/PadImpl.cpp +++ b/src/operator/PadImpl.cpp @@ -60,7 +60,12 @@ void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) { const auto outDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + + const T alpha = 1.0f; + const T beta = 0.0f; + T *output = static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + Aidge::cudaPadding(CudaContext::getDeviceProp(), outDims[1], outDims[3], @@ -74,7 +79,9 @@ void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) mPadType, static_cast<T>(mPadVal), inputPtr, - output); + output, + alpha, + beta); } template <Aidge::DimIdx_t DIM> @@ -116,7 +123,12 @@ void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) { const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const auto inputGradDims = op.getInput(0)->grad()->dims(); + + const T alpha = 1.0f; + const T beta = 1.0f; // accumulate + T *inputGrad = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + Aidge::cudaPadding(CudaContext::getDeviceProp(), inputGradDims[1], inputGradDims[3], @@ -130,7 +142,9 @@ void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) mPadType, static_cast<T>(mPadVal), static_cast<const T *>(outGrad.getImpl()->rawPtr()), - inputGrad); + inputGrad, + alpha, + beta); } // Template declarations diff --git a/src/operator/PadImpl_CUDA_kernels.cu b/src/operator/PadImpl_CUDA_kernels.cu index a20a4c10a6cb5e783a09868389b8f968bc0f42a3..0628751311ab69d6b19cc4ff870f93f6dae2cf5a 100644 --- a/src/operator/PadImpl_CUDA_kernels.cu +++ b/src/operator/PadImpl_CUDA_kernels.cu @@ -23,7 +23,9 @@ __global__ void cudaPadding_kernel(unsigned int nbOutputs, unsigned int padType, T padValue, const T *input, - T *outputs) + T *outputs, + const T alpha, + const T beta) { const unsigned int inputOffset = (blockIdx.z * blockDim.z + 
threadIdx.z) * nbChannels * inputWidth * inputHeight; @@ -48,8 +50,8 @@ __global__ void cudaPadding_kernel(unsigned int nbOutputs, if (ix >= 0 && ix < (int)inputWidth && iy >= 0 && iy < (int)inputHeight) { - outputValue = input[ix + - iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + int inputIndex = ix + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset; + outputValue = input[inputIndex]; } } else if (padType == 1) // Edge padding @@ -57,8 +59,8 @@ __global__ void cudaPadding_kernel(unsigned int nbOutputs, int ix = max(0, min((int)inputWidth - 1, (int)ox - leftPad)); int iy = max(0, min((int)inputHeight - 1, (int)oy - topPad)); - outputValue = input[ix + - iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + int inputIndex = ix + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset; + outputValue = input[inputIndex]; } else if (padType == 2) // Reflect padding { @@ -74,18 +76,22 @@ __global__ void cudaPadding_kernel(unsigned int nbOutputs, if (iy >= (int)inputHeight) iy = (int)inputHeight - iy; - outputValue = input[ix + - iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + int inputIndex = ix + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset; + outputValue = input[inputIndex]; } else if (padType == 3) // Wrap padding { int ix = (inputWidth + (int)ox - leftPad) % inputWidth; int iy = (inputHeight + (int)oy - topPad) % inputHeight; - outputValue = input[ix + - iy * inputWidth + ch * inputWidth * inputHeight + inputOffset]; + int inputIndex = ix + iy * inputWidth + ch * inputWidth * inputHeight + inputOffset; + outputValue = input[inputIndex]; } - outputs[ox + oy * outputWidth + ch * outputWidth * outputHeight + outputOffset] = outputValue; + + int outputIndex = ox + oy * outputWidth + ch * outputWidth * outputHeight + outputOffset; + + // old : outputs[outputIndex] = outputValue; + outputs[outputIndex] = alpha * outputValue + beta * outputs[outputIndex]; } } } @@ -105,7 +111,9 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, unsigned int padType, double padValue, const double *input, - double *outputs) + double *outputs, + const double alpha, + const double beta) { const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; @@ -131,7 +139,9 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, padType, padValue, input, - outputs); + outputs, + alpha, + beta); CHECK_CUDA_STATUS(cudaPeekAtLastError()); } @@ -149,7 +159,9 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, unsigned int padType, float padValue, const float *input, - float *outputs) + float *outputs, + const float alpha, + const float beta) { const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; @@ -175,7 +187,9 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, padType, padValue, input, - outputs); + outputs, + alpha, + beta); CHECK_CUDA_STATUS(cudaPeekAtLastError()); } @@ -193,7 +207,9 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, unsigned int padType, half padValue, const half *input, - half *outputs) + half *outputs, + const half alpha, + const half beta) { const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock; const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize; @@ -219,6 +235,8 @@ void Aidge::cudaPadding(const cudaDeviceProp &deviceProp, padType, padValue, input, - outputs); + outputs, + 
alpha, + beta); CHECK_CUDA_STATUS(cudaPeekAtLastError()); } \ No newline at end of file diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp index 80d52045e832b42a95b6d7448f2016530bb9d1ac..db2739290c1deab2995c360573afae410d2870b8 100644 --- a/src/operator/ReLUImpl.cpp +++ b/src/operator/ReLUImpl.cpp @@ -94,10 +94,13 @@ void Aidge::ReLUImpl_cuda::backward() { } template <class T> -void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) { +void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate + CHECK_CUDNN_STATUS( cudnnActivationBackward(CudaContext::cudnnHandle(), mReLUDesc, diff --git a/src/operator/ReduceImpl_CUDA_kernels.cu b/src/operator/ReduceImpl_CUDA_kernels.cu index 7002e34116d2c1050987dc0cb93dbf7339a7ea93..4ce42389624fcfb88b0d6eb35a746a24504ac35a 100644 --- a/src/operator/ReduceImpl_CUDA_kernels.cu +++ b/src/operator/ReduceImpl_CUDA_kernels.cu @@ -12,7 +12,18 @@ #include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" template <typename T> -__global__ void duplicateElements(const T* input, T* output, const std::size_t* shape, const std::size_t* new_shape, const int* axes, const std::size_t* factors, int num_dims, int num_axes) { +__global__ void duplicateElements( + const T* input, + T* output, + const std::size_t* shape, + const std::size_t* new_shape, + const int* axes, + const std::size_t* factors, + int num_dims, + int num_axes, + const T alpha, + const T beta) +{ int idx = blockIdx.x * blockDim.x + threadIdx.x; int input_size = 1; int output_size = 1; @@ -55,15 +66,25 @@ __global__ void duplicateElements(const T* input, T* output, const std::size_t* output_stride *= new_shape[i]; } - output[out_linear_idx] = input[in_linear_idx]; + // old : output[out_linear_idx] = input[in_linear_idx]; + output[out_linear_idx] = alpha * input[in_linear_idx] + beta * output[out_linear_idx]; delete[] out_idx; delete[] in_idx; } template <typename T> -void Aidge::ReduceBackward(const T* input, T* output, const std::vector<std::size_t>& inputDims, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, int outSize) { - +void Aidge::ReduceBackward( + const T* input, + T* output, + const std::vector<std::size_t>& inputDims, + const std::vector<std::size_t>& outputDims, + const std::vector<int>& axes, + const std::vector<std::size_t>& factors, + int outSize, + const T alpha, + const T beta) +{ std::size_t* d_shape; std::size_t* d_new_shape; int* d_axes; @@ -81,7 +102,18 @@ void Aidge::ReduceBackward(const T* input, T* output, const std::vector<std::siz int blockSize = 256; int numBlocks = (outSize + blockSize - 1) / blockSize; - duplicateElements<<<numBlocks, blockSize>>>(input, output, d_shape, d_new_shape, d_axes, d_factors, static_cast<int>(inputDims.size()), static_cast<int>(axes.size())); + duplicateElements<<<numBlocks, blockSize>>> ( + input, + output, + d_shape, + d_new_shape, + d_axes, + d_factors, + static_cast<int>(inputDims.size()), + static_cast<int>(axes.size()), + alpha, + beta); + cudaFree(d_shape); cudaFree(d_new_shape); cudaFree(d_axes); @@ -95,7 +127,9 @@ template void Aidge::ReduceBackward(const double* input, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const 
std::vector<std::size_t>& factors, - int outSize); + int outSize, + const double alpha, + const double beta); template void Aidge::ReduceBackward(const float* input, float* output, @@ -103,7 +137,10 @@ template void Aidge::ReduceBackward(const float* input, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, - int outSize); + int outSize, + const float alpha, + const float beta); + template void Aidge::ReduceBackward(const half* input, half* output, @@ -111,4 +148,6 @@ template void Aidge::ReduceBackward(const half* input, const std::vector<std::size_t>& outputDims, const std::vector<int>& axes, const std::vector<std::size_t>& factors, - int outSize); + int outSize, + const half alpha, + const half beta); diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp index 645929355d9c9036503ae8a90043573ed0aef4b1..2746d4c36fd2d2a5cfe8196d4b091c67ce0f2324 100644 --- a/src/operator/ReduceMeanImpl.cpp +++ b/src/operator/ReduceMeanImpl.cpp @@ -179,9 +179,15 @@ void Aidge::ReduceMeanImpl_cuda::backward() { template <class T> void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + const T alpha = 1.0f; + const T beta = 1.0f; // accumulate + const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); @@ -196,5 +202,7 @@ void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vec op.getInput(0)->grad()->dims(), axes, factors, - static_cast<int>(op.getInput(0)->grad()->size())); + static_cast<int>(op.getInput(0)->grad()->size()), + alpha, + beta); } diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp index 84658cae495fefb1b893b78e1515e42a7d1f65f7..e8c5b1e98d10d40dc01157465ba21f3a5330ced4 100644 --- a/src/operator/ReduceSumImpl.cpp +++ b/src/operator/ReduceSumImpl.cpp @@ -178,7 +178,11 @@ void Aidge::ReduceSumImpl_cuda::backward() { } template <class T> -void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { +void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) +{ + const T alpha = 1.0f; + const T beta = 1.0f; // accumulate + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); @@ -195,5 +199,7 @@ void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vect op.getInput(0)->grad()->dims(), axes, factors, - static_cast<int>(op.getInput(0)->grad()->size())); + static_cast<int>(op.getInput(0)->grad()->size()), + alpha, + beta); } diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp index 783e244057b0fc42a782fd363c3a99aa6d73b46b..49f732e120fd5de9454b47828caaa5b2ce6f5c58 100644 --- a/src/operator/ReshapeImpl.cpp +++ b/src/operator/ReshapeImpl.cpp @@ -32,10 +32,11 @@ void Aidge::ReshapeImpl_cuda::forward() { } void Aidge::ReshapeImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); - 
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); + std::static_pointer_cast<Tensor> (mOp.getRawInput(0))->grad()->getImpl()->setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); } diff --git a/src/operator/RoundImpl.cpp b/src/operator/RoundImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..28203a2cd35f0cbd44af6d80b0fe14b68b1bcd2e --- /dev/null +++ b/src/operator/RoundImpl.cpp @@ -0,0 +1,64 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include <cuda_fp16.h> +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/RoundImpl.hpp" +#include "aidge/backend/cuda/operator/RoundImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Round.hpp" +#include "aidge/utils/Types.h" + +void Aidge::RoundImpl_cuda::forward() { + const Round_Op& op = static_cast<const Round_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Round operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Round forward because the 0-th input has no implementation."); + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(); + break; + case DataType::Float32: + forward_<float>(); + break; + case DataType::Float16: + forward_<half>(); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::RoundImpl_cuda::forward_() +{ + const Round_Op& op = static_cast<const Round_Op&>(mOp); + int size = op.getInput(0)->size(); + const T* inputPtr = static_cast<T*>(op.getInput(0)->getImpl()->rawPtr()); + T* outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + Aidge::roundForward<T>(inputPtr,outputPtr,size); +} + +void Aidge::RoundImpl_cuda::backward() { + // TODO +} + +template <class T> +void Aidge::RoundImpl_cuda::backward_(const Tensor& outGrad) { + // TODO +} \ No newline at end of file diff --git a/src/operator/RoundImpl_CUDA_kernels.cu b/src/operator/RoundImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca2decadd2dacf6dfe76ef091b959434e25fb6d7 --- /dev/null +++ b/src/operator/RoundImpl_CUDA_kernels.cu @@ -0,0 +1,80 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/RoundImpl_CUDA_kernels.hpp" + +// Helper function for Round +#include <math.h> + +template <typename T> +__device__ T round_util(T a) { + if (a - floor(a) == 0.5) { + if (fmod(floor(a), 2.0) == 0.0) { + return floor(a); + } else { + return ceil(a); + } + } + return round(a); +} +template <> +__device__ float round_util<float>(float a) { + if (a - floor(a) == 0.5) { + if (fmodf(floor(a), 2.0) == 0.0) { + return floor(a); + } else { + return ceil(a); + } + } + return roundf(a); +} + + +template <> +__device__ half round_util<half>(half a) { +#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 8000 + return hrint(a); // round to the nearest integer in half precision (ties to even) +#else + float af = __half2float(a); + return __float2half(round_util(af)); +#endif +} + + + + + +template <class T> +__global__ void roundKernel(const T* input, T* output, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + output[idx] = round_util(input[idx]); +} + +template <class T> + +void Aidge::roundForward(const T* input, T* output,int size) +{ + int blockSize = 256; + int numBlocks = (size + blockSize - 1) / blockSize; + + roundKernel<<<numBlocks, blockSize>>>(input, output, size); + + CHECK_CUDA_STATUS(cudaGetLastError()); + CHECK_CUDA_STATUS(cudaDeviceSynchronize()); +}; + +template void Aidge::roundForward<double>(const double* input, double* output, int size); + +template void Aidge::roundForward<float>(const float* input, float* output, int size); + +template void Aidge::roundForward<half>(const half* input, half* output, int size); + diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp index 386cd9d821b3019cf8f0de2cc757ae514446f1a6..f6b0695cc71ffb82d6a1514195b13d23bae4a213 100644 --- a/src/operator/SigmoidImpl.cpp +++ b/src/operator/SigmoidImpl.cpp @@ -95,9 +95,12 @@ void Aidge::SigmoidImpl_cuda::backward() { template <class T> void Aidge::SigmoidImpl_cuda::backward_(const Tensor& output_grad) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate + CHECK_CUDNN_STATUS( cudnnActivationBackward(CudaContext::cudnnHandle(), mSigmoidDesc, diff --git a/src/operator/SqrtImpl.cpp b/src/operator/SqrtImpl.cpp index 60498e2907b0953b756064d6d19aeb1667ea7575..c1eccd107f421592619474fbb4c641a0f0b958bc 100644 --- a/src/operator/SqrtImpl.cpp +++ b/src/operator/SqrtImpl.cpp @@ -20,7 +20,8 @@ #include "aidge/operator/Sqrt.hpp" #include "aidge/utils/Types.h" -void Aidge::SqrtImpl_cuda::forward() { +void Aidge::SqrtImpl_cuda::forward() +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); @@ -43,15 +44,25 @@ void Aidge::SqrtImpl_cuda::forward() { } template <class T> -void Aidge::SqrtImpl_cuda::forward_(const Tensor& input) { +void Aidge::SqrtImpl_cuda::forward_(const Tensor& input) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + const T alpha = 1.0f; + const T beta = 0.0f; + const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); - Aidge::sqrtForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); + 
Aidge::sqrtForward<T>(inputPtr, + outputPtr, + static_cast<int>(op.getOutput(0)->size()), + alpha, + beta); } -void Aidge::SqrtImpl_cuda::backward() { +void Aidge::SqrtImpl_cuda::backward() +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); @@ -76,11 +87,21 @@ void Aidge::SqrtImpl_cuda::backward() { } template <class T> -void Aidge::SqrtImpl_cuda::backward_(const Tensor& input, const Tensor& output_grad) { +void Aidge::SqrtImpl_cuda::backward_(const Tensor& input, const Tensor& output_grad) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + const T alpha = 1.0f; + const T beta = 1.0f; // accumulate + const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); const T * outputGradPtr = static_cast<const T*>(output_grad.getImpl()->rawPtr()); T * inputGradPtr = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); - Aidge::sqrtBackward<T>(inputPtr, outputGradPtr, inputGradPtr, static_cast<int>(op.getOutput(0)->size())); + Aidge::sqrtBackward<T>(inputPtr, + outputGradPtr, + inputGradPtr, + static_cast<int>(op.getOutput(0)->size()), + alpha, + beta); } diff --git a/src/operator/SqrtImpl_CUDA_kernels.cu b/src/operator/SqrtImpl_CUDA_kernels.cu index b8da130e8b7cf4d7f94f2567ba77c7da363441ea..7af45c3ea9a0b8fb8a4ceab4cb9944f38fab3111 100644 --- a/src/operator/SqrtImpl_CUDA_kernels.cu +++ b/src/operator/SqrtImpl_CUDA_kernels.cu @@ -47,46 +47,70 @@ __device__ half mul_helper<half>(half a, half b) { // Forward Kernel template <class T> -__global__ void sqrtCUDAForwardKernel(const T* input, T* output, int size) { +__global__ void sqrtCUDAForwardKernel(const T* input, + T* output, + int size, + const T alpha, + const T beta) +{ int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; - output[idx] = sqrt_helper(input[idx]); + output[idx] = alpha * sqrt_helper(input[idx]) + beta * output[idx]; } - template <class T> -void Aidge::sqrtForward(const T* input, T* output, int size) +void Aidge::sqrtForward(const T* input, + T* output, + int size, + const T alpha, + const T beta) { const int blockSize = 256; int numBlocks = (size + blockSize - 1) / blockSize; // Launch the kernel - sqrtCUDAForwardKernel<<<numBlocks, blockSize>>>(input, output, size); + sqrtCUDAForwardKernel<<<numBlocks, blockSize>>>(input, output, size, alpha, beta); }; // Backward Kernel template <class T> -__global__ void sqrtCUDABackwardKernel(const T* input, const T* outputGrad, T* inputGrad, int size) { +__global__ void sqrtCUDABackwardKernel(const T* input, + const T* outputGrad, + T* inputGrad, + int size, + const T alpha, + const T beta) +{ int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; - inputGrad[idx] = outputGrad[idx] / mul_helper(static_cast<T>(2), sqrt_helper(input[idx])); + T val = outputGrad[idx] / mul_helper(static_cast<T>(2), sqrt_helper(input[idx])); + + inputGrad[idx] = alpha * val + beta * inputGrad[idx]; } template <class T> -void Aidge::sqrtBackward(const T* input, const T* outputGrad, T* inputGrad, int size) +void Aidge::sqrtBackward(const T* input, + const T* outputGrad, + T* inputGrad, + int size, + const T alpha, + const T beta) { const int blockSize = 256; int numBlocks = (size + blockSize - 1) / blockSize; // Launch the kernel - sqrtCUDABackwardKernel<<<numBlocks, blockSize>>>(input, outputGrad, inputGrad, size); + sqrtCUDABackwardKernel<<<numBlocks, blockSize>>>(input, outputGrad, inputGrad, size, alpha, beta); }; -template void 
Aidge::sqrtForward<double>(const double* input, double* output, int size); -template void Aidge::sqrtForward<float>(const float* input, float* output, int size); -template void Aidge::sqrtForward<half>(const half* input, half* output, int size); -template void Aidge::sqrtBackward<double>(const double* input, const double* outputGrad, double* inputGrad, int size); -template void Aidge::sqrtBackward<float>(const float* input, const float* outputGrad, float* inputGrad, int size); -template void Aidge::sqrtBackward<half>(const half* input, const half* outputGrad, half* inputGrad, int size); \ No newline at end of file +template void Aidge::sqrtForward<double>(const double* input, double* output, int size, const double alpha, const double beta); +template void Aidge::sqrtForward<float>(const float* input, float* output, int size, const float alpha, const float beta); +template void Aidge::sqrtForward<half>(const half* input, half* output, int size, const half alpha, const half beta); + +template void Aidge::sqrtBackward<double>(const double* input, const double* outputGrad, double* inputGrad, int size, const double alpha, const double beta); +template void Aidge::sqrtBackward<float>(const float* input, const float* outputGrad, float* inputGrad, int size, const float alpha, const float beta); +template void Aidge::sqrtBackward<half>(const half* input, const half* outputGrad, half* inputGrad, int size, const half alpha, const half beta); diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index a04a1c3018b0c9ba455d21ba563253eb3e004e10..249d95f5a03c17e96db41c924361be3de1cbc6b0 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -155,11 +155,17 @@ void Aidge::SubImpl_cuda::backward() { } template <class T> -void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { +void Aidge::SubImpl_cuda::backward_( + const Tensor& outputGrad, + const std::vector<std::vector<int>>& inputsDims, + const std::vector<std::vector<int>>& inputsStrides) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; + for (std::size_t i = 0; i < inputsDims.size(); i++) { if (op.getInput(i)->size() == op.getOutput(0)->size()) diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp index 96c0330febba35cfea04bbbac97d9308195d6309..2e61280ec83488cbba5b7b1fd23d49e70276790c 100644 --- a/src/operator/TanhImpl.cpp +++ b/src/operator/TanhImpl.cpp @@ -94,10 +94,13 @@ void Aidge::TanhImpl_cuda::backward() { } template <class T> -void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) { +void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) +{ const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; - const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 1.0f; // accumulate + CHECK_CUDNN_STATUS( cudnnActivationBackward(CudaContext::cudnnHandle(), mTanhDesc, diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp index b8129175d88323c896244e531f1dd52a5cbaa19e..dffabe6aab92bdfdd0c79b61ab59e9bc6efb9d94 100644 --- a/unit_tests/Test_AddImpl.cpp 
+++ b/unit_tests/Test_AddImpl.cpp @@ -22,48 +22,27 @@ using namespace Aidge; TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { - std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - input1->setBackend("cuda"); - SECTION("One input") { - std::shared_ptr<Node> myAdd = Add(1); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input1); - op->setBackend("cuda"); - op->setDataType(DataType::Float32); - myAdd->forward(); - - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost); - float* targetOutput = new float[input1->size()](); - cudaMemcpy(targetOutput, input1->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < input1->size(); i++){ - REQUIRE(fabs(computedOutput[i] - targetOutput[i]) < 1e-6); - } - - delete[] computedOutput; - delete[] targetOutput; - } - - SECTION("Two inputs") { + SECTION("Same input") { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + {{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + input1->setBackend("cuda"); std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { { { @@ -84,7 +63,7 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } }); - std::shared_ptr<Node> myAdd = Add(2); + std::shared_ptr<Node> myAdd = Add(); auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); op->associateInput(0, input1); op->associateInput(1, input1); @@ -103,47 +82,6 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { delete[] computedOutput; } - SECTION("Three inputs") { - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { - { - {{ 60, 141},{ 63, 144},{ 66, 147}}, - {{ 69, 150},{ 72, 153},{ 75, 156}}, - {{ 78, 159},{ 81, 162},{ 84, 165}} - }, - { - {{ 87, 168},{ 90, 171},{ 93, 174}}, - {{ 96, 177},{ 99, 180},{102, 183}}, - {{105, 186},{108, 189},{111, 192}} - }, - { - {{114, 195},{117, 198},{120, 201}}, - {{123, 204},{126, 207},{129, 210}}, - {{132, 213},{135, 216},{138, 219}} - } - } - }); - - std::shared_ptr<Node> myAdd = Add(3); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input1); - op->associateInput(1, input1); - op->associateInput(2, input1); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myAdd->forward(); - - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedOutput->size(); i++){ - const float targetOutput = 
*(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; - } - SECTION("Broadcasting") { std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { { // @@ -168,47 +106,80 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } // }); // - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{ 120, 222},{ 124, 226},{ 128, 230}}, // - {{ 126, 228},{ 130, 232},{ 134, 236}}, // - {{ 132, 234},{ 136, 238},{ 140, 242}} // - }, // - { // - {{ 126, 228},{ 130, 232},{ 134, 236}}, // - {{ 132, 234},{ 136, 238},{ 140, 242}}, // - {{ 138, 240},{ 142, 244},{ 146, 248}} // - }, // - { // - {{ 132, 234},{ 136, 238},{140, 242}}, // - {{ 138, 240},{ 142, 244},{146, 248}}, // - {{ 144, 246},{ 148, 250},{152, 254}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); input_0->setBackend("cuda"); input_1->setBackend("cuda"); input_2->setBackend("cuda"); - std::shared_ptr<Node> myAdd = Add(3); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input_0); - op->associateInput(1, input_1); - op->associateInput(2, input_2); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myAdd->forward(); - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, d3) + std::shared_ptr<Tensor> expectedOutput0 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + { // + { // + {{ 20, 22},{ 24, 26},{ 28, 30}}, // + {{ 26, 28},{ 30, 32},{ 34, 36}}, // + {{ 32, 34},{ 36, 38},{ 40, 42}} // + }, // + { // + {{ 26, 28},{ 30, 32},{ 34, 36}}, // + {{ 32, 34},{ 36, 38},{ 40, 42}}, // + {{ 38, 40},{ 42, 44},{ 46, 48}} // + }, // + { // + {{ 32, 34},{ 36, 38},{40, 42}}, // + {{ 38, 40},{ 42, 44},{46, 48}}, // + {{ 44, 46},{ 48, 50},{52, 54}} // + } // + } // + }); // - for(int i = 0; i < expectedOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + std::shared_ptr<Node> myAdd0 = Add(); + auto op0 = std::static_pointer_cast<OperatorTensor>(myAdd0 -> getOperator()); + op0->associateInput(0, input_0); + op0->associateInput(1, input_1); + op0->setDataType(DataType::Float32); + op0->setBackend("cuda"); + myAdd0->forward(); + + float* computedOutput0 = new float[expectedOutput0->size()](); + cudaMemcpy(computedOutput0, op0->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput0->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedOutput0->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedOutput0->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput0[i] - targetOutput) < 1e-6); } - delete[] computedOutput; + delete[] computedOutput0; + + /// Input0(d0, d1, d2, d3) + Input1(d3) = Output(d0, d1, d2, d3) + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { + { // + { // + {{100, 201},{102, 203},{104, 205}} // + }, // + { // + {{106, 207},{108, 209},{110, 211}} // + }, // + { // + {{112, 213},{114, 215},{116, 217}} // + } // + } // + }); // + 
std::shared_ptr<Node> myAdd1 = Add(); + auto op1 = std::static_pointer_cast<OperatorTensor>(myAdd1 -> getOperator()); + op1->associateInput(0, input_0); + op1->associateInput(1, input_2); + op1->setDataType(DataType::Float32); + op1->setBackend("cuda"); + myAdd1->forward(); + float* computedOutput1 = new float[expectedOutput1->size()](); + cudaMemcpy(computedOutput1, op1->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput1->size(), cudaMemcpyDeviceToHost); + + for(int i = 0; i < expectedOutput1->size(); i++){ + const float targetOutput = *(static_cast<float*>(expectedOutput1->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput1[i] - targetOutput) < 1e-6); + } + + delete[] computedOutput1; } SECTION("Random Input") { @@ -231,11 +202,11 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Add Operator CUDA - std::shared_ptr<Node> myAddCUDA = Add(2, "myaddcuda"); + std::shared_ptr<Node> myAddCUDA = Add("myaddcuda"); auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator()); // Create Add Operator CPU - std::shared_ptr<Node> myAddCPU = Add(2, "myaddcpu"); + std::shared_ptr<Node> myAddCPU = Add("myaddcpu"); auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -360,16 +331,12 @@ TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { } // }); // - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); - input_0->setBackend("cuda"); input_1->setBackend("cuda"); - input_2->setBackend("cuda"); - std::shared_ptr<Node> myAdd = Add(3); + std::shared_ptr<Node> myAdd = Add(); auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); op->associateInput(0, input_0); op->associateInput(1, input_1); - op->associateInput(2, input_2); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAdd->forward(); @@ -420,14 +387,11 @@ TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { } // } // }); // - std::shared_ptr<Tensor> expectedInput3Grad = std::make_shared<Tensor>(Array1D<float,2> {{729, 756}}); float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost); float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost); - float *computedGrad3Cuda = new float[expectedInput3Grad->size()](); - cudaMemcpy(computedGrad3Cuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput3Grad->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < expectedInput1Grad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i); @@ -437,12 +401,7 @@ TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); } - for(int i = 0; i < expectedInput3Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput3Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad3Cuda[i] - targetOutput) < 1e-6); - } delete[] computedGrad1Cuda; delete[] computedGrad2Cuda; - delete[] computedGrad3Cuda; } \ No newline at end 
of file diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp index c83624020d86a2eb786d249c5ee664ca3bfdde3b..5b8d3eae7b8816bf70ffdd7e78b56305fe0f7191 100644 --- a/unit_tests/Test_BatchNormImpl.cpp +++ b/unit_tests/Test_BatchNormImpl.cpp @@ -25,7 +25,7 @@ using namespace Aidge; TEST_CASE("[gpu/operator] BatchNorm(forward)") { SECTION("Static Input") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, false, "mybatchnorm"); auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -148,13 +148,13 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { // Create BatchNorm Operator Cuda - std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, false, "mybatchnormcuda"); auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create BatchNorm Operator CPU - std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, false, "mybatchnormcpu"); auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); diff --git a/unit_tests/Test_ClipImpl.cpp b/unit_tests/Test_ClipImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b62f874c198a544f571a98ebb22c1cd7fbd923cf --- /dev/null +++ b/unit_tests/Test_ClipImpl.cpp @@ -0,0 +1,122 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +using namespace std::chrono; +namespace Aidge { + +TEST_CASE("[gpu/operator] Clip", "[Clip][GPU]") +{ + + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0, 10); + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),std::size_t(20)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(6)); + std::uniform_int_distribution<int> boolDist(0,1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> startcuda; + std::chrono::time_point<std::chrono::system_clock> endcuda; + std::chrono::time_point<std::chrono::system_clock> startcpu; + std::chrono::time_point<std::chrono::system_clock> endcpu; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + std::shared_ptr<Node> myClipCUDA = Clip("clcuda",1.0,3.0); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myClipCUDA -> getOperator()); + + // Create Div Operator CPU + std::shared_ptr<Node> myClipCPU = Clip("clcpu",1.0,3.0); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myClipCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0; + for (std::size_t i = 0; i < nbDims; ++i) + { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + } + + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements0]; + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + + + + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + startcuda = std::chrono::system_clock::now(); + op_cuda->forward(); + endcuda = std::chrono::system_clock::now(); + + + // forward CPU + startcpu = std::chrono::system_clock::now(); + op_cpu->forward(); + endcpu = std::chrono::system_clock::now(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; 
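+ // Bring the CUDA output back into a CPU tensor of the same data type (via the fallback) so it can be compared element-wise with approxEq against the CPU reference below.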
+ const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0));; + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + + auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); + std::cout << "CUDA exec time: " << duration_cuda << " ms" << std::endl; + auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); + std::cout << "CPU exec time: " << duration_cpu << " ms" << std::endl; + //Exec time difference (CPU - CUDA): + auto difference = duration_cpu - duration_cuda; + std::cout << "Exec time difference (CPU - CUDA): " << difference << " ms" << std::endl; + + } +} +} // namespace Aidge diff --git a/unit_tests/Test_RoundImpl.cpp b/unit_tests/Test_RoundImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4602e29c8070884927f2b4b3cda560aae380fd71 --- /dev/null +++ b/unit_tests/Test_RoundImpl.cpp @@ -0,0 +1,119 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/operator/Round.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +using namespace std::chrono; +namespace Aidge { + +TEST_CASE("[gpu/operator] Round", "[Round][GPU]") +{ + + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0, 10); + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(20)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3), std::size_t(6)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> startcuda; + std::chrono::time_point<std::chrono::system_clock> endcuda; + std::chrono::time_point<std::chrono::system_clock> startcpu; + std::chrono::time_point<std::chrono::system_clock> endcpu; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + std::shared_ptr<Node> myRoundCUDA = Round("clcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myRoundCUDA -> getOperator()); + + // Create Round Operator CPU + std::shared_ptr<Node> myRoundCPU = Round("clcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myRoundCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0; + for (std::size_t i = 0; i < nbDims; ++i) + { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + } + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements0]; + for (std::size_t i = 0; i < nb_elements0; ++i) { + 
array0[i] = valueDist(gen); + } + + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + startcuda = std::chrono::system_clock::now(); + op_cuda->forward(); + endcuda = std::chrono::system_clock::now(); + + + // forward CPU + startcpu = std::chrono::system_clock::now(); + op_cpu->forward(); + endcpu = std::chrono::system_clock::now(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + + auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); + std::cout << "CUDA exec time: " << duration_cuda << " ms" << std::endl; + auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); + std::cout << "CPU exec time: " << duration_cpu << " ms" << std::endl; + // Benchmark between CUDA and CPU execution time + auto difference = duration_cpu - duration_cuda; + std::cout << "Exec time difference (CPU - CUDA): " << difference << " ms" << std::endl; + + } +} +} // namespace Aidge \ No newline at end of file diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp index cb120a970c5310f80f8c62960c029a845937ba30..c24b5b457cce602d465a1ecddefc7b7f35964794 100644 --- a/unit_tests/Test_TensorImpl.cpp +++ b/unit_tests/Test_TensorImpl.cpp @@ -37,7 +37,7 @@ TEST_CASE("CUDA test") { } // Allocate device memory - float *d_a, *d_b, *d_out; + float *d_a, *d_b, *d_out; cudaMalloc(reinterpret_cast<void**>(&d_a), sizeof(float) * N); cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(float) * N); cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(float) * N); @@ -46,9 +46,9 @@ TEST_CASE("CUDA test") { cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice); - // Executing kernel + // Executing kernel vector_add(d_out, d_a, d_b, N); - + // Transfer data back to host memory cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost); @@ -71,7 +71,6 @@ TEST_CASE("CUDA test") { TEST_CASE("Tensor creation", "[Connector]") { SECTION("from const array") { Tensor x; - x.setBackend("cuda"); x = Array3D<int,2,2,2>{ { { @@ -83,6 +82,7 @@ TEST_CASE("Tensor creation", "[Connector]") { {7, 8} } }}; + x.setBackend("cuda"); REQUIRE(x.nbDims() == 3); REQUIRE(x.dims()[0] == 2);
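The recurring change across the backward paths in this patch is the switch of the blending factor from beta = 0.0f to beta = 1.0f, together with the new alpha/beta parameters threaded through the custom kernels (cudaPadding, ReduceBackward, sqrtBackward): each backward pass now accumulates its contribution into the existing gradient buffer instead of overwriting it. A minimal sketch of that convention, with a hypothetical kernel name that is not part of the patch:

template <class T>
__global__ void accumulateGradKernel(const T* contribution, T* grad, int size,
                                     const T alpha, const T beta)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;
    // beta == 0 overwrites the previous gradient; beta == 1 accumulates into it,
    // which is what a tensor feeding several operators needs during backpropagation.
    grad[idx] = alpha * contribution[idx] + beta * grad[idx];
}

With accumulation, the gradient buffers are expected to be zero-initialized before the first backward call; the forward paths (e.g. sqrtForward and the forward cudaPadding call above) keep beta = 0.0f and simply overwrite their outputs.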