diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index d5e9d1654f0a4fe894ed0e965a25b32c9e5caa06..0d7a55aaa3d3d5d150c1e55f6fd9f87d65c3e3ab 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -27,19 +27,18 @@ #include "aidge/backend/cuda/operator/MulImpl.hpp" #include "aidge/backend/cuda/operator/PadImpl.hpp" #include "aidge/backend/cuda/operator/PowImpl.hpp" +#include "aidge/backend/cuda/operator/ReLUImpl.hpp" #include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" -#include "aidge/backend/cuda/operator/ReLUImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ReshapeImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/SigmoidImpl.hpp" #include "aidge/backend/cuda/operator/SubImpl.hpp" #include "aidge/backend/cuda/operator/TanhImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" - +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 541afeecc751332d41ff082b790282abcad5a1b0..35f6cfc1f517a31570c2d7b25341413330728314 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -1,30 +1,38 @@ #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ -#include <cstddef> // std::size_t +#include <cstddef> // std::size_t #include <memory> #include <string> #include "aidge/backend/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/future_std/span.hpp" -#include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { template <typename SRC_T, typename DST_T> -void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/); -template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr> -void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size); -template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr> -void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size); +void thrust_copy(const SRC_T * /*srcData*/, + DST_T * /*dstData*/, + size_t /*size*/); +template <typename SRC_T, + typename std::enable_if< + !std::is_same<half_float::half, SRC_T>::value>::type * = nullptr> +void thrust_copy(const SRC_T *srcData, half_float::half *dstData, size_t size); +template <typename DST_T, + typename std::enable_if< + !std::is_same<half_float::half, DST_T>::value>::type * = nullptr> +void thrust_copy(const half_float::half *srcData, DST_T *dstData, size_t size); template <> -void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size); +void thrust_copy(const half_float::half *srcData, + 
half_float::half *dstData, + size_t size); /** * @brief Abstract class for the TensorImpl_cuda class template. @@ -33,17 +41,18 @@ void thrust_copy(const half_float::half* srcData, half_float::half* dstData, siz * class), but whose data type does not need to be known. */ class TensorImpl_cuda_ { -protected: + protected: mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr; -public: + public: /** * @brief Return the CuDNN tensor descriptor of the tensor. * @details This method uses lazy initialization for the descriptor * (which is therefore mutable in the derived class). * @return cudnnTensorDescriptor_t CuDNN tensor descriptor. */ - virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0; + virtual const cudnnTensorDescriptor_t & + getCudnnTensorDesc(const Tensor &tensor) const = 0; virtual ~TensorImpl_cuda_() { if (mCudnnTensor != nullptr) @@ -52,137 +61,199 @@ public: }; template <class T> -class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { -private: - static T* cudaAlloc(NbElts_t length) { - T* data; - CHECK_CUDA_STATUS(cudaMalloc(reinterpret_cast<void**>(&data), length * sizeof(T))); +class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { + private: + static T *cudaAlloc(NbElts_t length) { + T *data; + CHECK_CUDA_STATUS( + cudaMalloc(reinterpret_cast<void **>(&data), length * sizeof(T))); return data; } - static void cudaDelete(T* data) { + static void cudaDelete(T *data) { // Should not be called if data is nullptr, according to the standard cudaFree(data); } -private: + private: future_std::span<T> mData; /// If this instance own the data, std::unique_ptr manages it std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner; -public: + public: static const std::string Backend; - TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {} - + TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) + : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {} bool operator==(const TensorImpl &otherImpl) const override final; - static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) { + static std::shared_ptr<TensorImpl_cuda> + create(DeviceIdx_t device, std::vector<DimSize_t> dims) { return std::make_shared<TensorImpl_cuda<T>>(device, dims); } // native interface - const future_std::span<T>& data() const { return mData; } + const future_std::span<T> &data() const { + return mData; + } - inline std::size_t capacity() const noexcept override { return mData.size(); } + inline std::size_t capacity() const noexcept override { + return mData.size(); + } - std::size_t scalarSize() const noexcept override { return sizeof(T); } + std::size_t scalarSize() const noexcept override { + return sizeof(T); + } void zeros() override final { CHECK_CUDA_STATUS(cudaMemset(rawPtr(), T(0), mNbElts * sizeof(T))); } void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - const T* srcT = static_cast<const T *>(src); - T* dstT = static_cast<T *>(rawPtr(offset)); - - AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", typeid(T).name()); - CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice)); + AIDGE_ASSERT(length <= mData.size() 
|| length <= mNbElts, + "TensorImpl_cuda<{}>::copy(): copy length ({}) is above " + "capacity ({})", + typeid(T).name(), + length, + mNbElts); + const T *srcT = static_cast<const T *>(src); + T *dstT = static_cast<T *>(rawPtr(offset)); + + AIDGE_ASSERT( + dstT < srcT || dstT >= srcT + length, + "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", + typeid(T).name()); + CHECK_CUDA_STATUS(cudaMemcpy(dstT, + srcT, + length * sizeof(T), + cudaMemcpyDeviceToDevice)); } - void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override { + void copyCast(const void *src, + const DataType srcDt, + NbElts_t length, + NbElts_t offset = 0) override { if (length == 0) { return; } - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is " + "above capacity ({})", + typeid(T).name(), + length, + mNbElts); switch (srcDt) { case DataType::Float64: - thrust_copy(static_cast<const double*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const double *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Float32: - thrust_copy(static_cast<const float*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const float *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Float16: - thrust_copy(static_cast<const half_float::half*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const half_float::half *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int64: - thrust_copy(static_cast<const int64_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int64_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt64: - thrust_copy(static_cast<const uint64_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint64_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int32: - thrust_copy(static_cast<const int32_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int32_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt32: - thrust_copy(static_cast<const uint32_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint32_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int16: - thrust_copy(static_cast<const int16_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int16_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt16: - thrust_copy(static_cast<const uint16_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint16_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int8: - thrust_copy(static_cast<const int8_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int8_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt8: - thrust_copy(static_cast<const uint8_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint8_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt); + AIDGE_THROW_OR_ABORT( + 
std::runtime_error, + "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", + typeid(T).name(), + srcDt); break; } } - void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); + void copyFromDevice(const void *src, + const std::pair<std::string, DeviceIdx_t> &device, + NbElts_t length, + NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) " + "is above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), + src, + length * sizeof(T), + cudaMemcpyDeviceToDevice)); } - void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice)); + void copyFromHost(const void *src, + NbElts_t length, + NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) " + "is above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), + src, + length * sizeof(T), + cudaMemcpyHostToDevice)); } - void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost)); + void copyToHost(void *dst, + NbElts_t length, + NbElts_t offset = 0) const override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is " + "above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(dst, + rawPtr(offset), + length * sizeof(T), + cudaMemcpyDeviceToHost)); } void *rawPtr(NbElts_t offset = 0) override { @@ -191,104 +262,139 @@ public: }; const void *rawPtr(NbElts_t offset = 0) const override { - AIDGE_ASSERT(mData.size() >= mNbElts, "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized const rawPtr", typeid(T).name()); + AIDGE_ASSERT(mData.size() >= mNbElts, + "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized " + "const rawPtr", + typeid(T).name()); return (mData.data() + offset); }; - const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override { + const cudnnTensorDescriptor_t & + getCudnnTensorDesc(const Tensor &tensor) const override { if (mCudnnTensor == nullptr) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor)); if (tensor.size() > 0) { /** - ** cudNN Tensors are restricted to having at least 4 dimensions : - ** When working with lower dimensionsal data, unused dimensions are set to 1. 
- ** Referes to the cudnnSetTensorNdDescriptor documentation from : - ** https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html + ** cuDNN Tensors are restricted to having at least 4 + *dimensions : + ** When working with lower dimensional data, unused + *dimensions are set to 1. + ** Refers to the cudnnSetTensorNdDescriptor documentation + *from : + ** + *https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html **/ - std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); - std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); + std::vector<int> dims(tensor.dims().cbegin(), + tensor.dims().cend()); + std::vector<int> strides(tensor.strides().cbegin(), + tensor.strides().cend()); if (dims.size() < 4) { dims.resize(4, 1); strides.resize(4, 1); } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, - CudaContext::data_type<T>::value, - dims.size(), - &dims[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor( + mCudnnTensor, + CudaContext::data_type<T>::value, + dims.size(), + &dims[0], + &strides[0])); } - } - else { + } else { // Compare if the shape of the tensor has changed cudnnDataType_t currentDataType; int currentNbDims; - // Since we don't know the nb dims of the current tensor, we init with CUDNN_DIM_MAX then remove the trailing zeros + // Since we don't know the nb dims of the current tensor, we init + // with CUDNN_DIM_MAX then remove the trailing zeros std::vector<int> currentDims(CUDNN_DIM_MAX); std::vector<int> currentStrides(CUDNN_DIM_MAX); - CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, &currentDataType, &currentNbDims, currentDims.data(), currentStrides.data())); + CHECK_CUDNN_STATUS( + cudnnGetTensorNdDescriptor(mCudnnTensor, + CUDNN_DIM_MAX, + &currentDataType, + &currentNbDims, + currentDims.data(), + currentStrides.data())); // Remove the trailing zeros - currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(), + currentDims.erase(std::find_if(currentDims.rbegin(), + currentDims.rend(), + [](int x) { return x != 0; }) + .base(), currentDims.end()); - std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); + std::vector<int> dims(tensor.dims().cbegin(), + tensor.dims().cend()); if (dims.size() < 4) { dims.resize(4, 1); } // Update descriptor if shape has changed - if (dims!=currentDims) { - std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); + if (dims != currentDims) { + std::vector<int> strides(tensor.strides().cbegin(), + tensor.strides().cend()); if (strides.size() < 4) { strides.resize(4, 1); } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, - CudaContext::data_type<T>::value, - dims.size(), - &dims[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor( + mCudnnTensor, + CudaContext::data_type<T>::value, + dims.size(), + &dims[0], + &strides[0])); } } return mCudnnTensor; } void setRawPtr(void *ptr, NbElts_t length) override final { - AIDGE_ASSERT(length >= mNbElts, "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer (length: {}) of insufficient capacity (required: {})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT( + length >= mNbElts, + "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer " + "(length: {}) of insufficient capacity (required: {})", + typeid(T).name(), + length, + mNbElts); mData = future_std::span<T>(static_cast<T *>(ptr), length); mDataOwner.reset(); }; virtual ~TensorImpl_cuda() = default; -private: +
private: void lazyInit() { if (mData.size() < mNbElts) { // Need more data, a re-allocation will occur - AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "TensorImpl_cuda<{}>: trying to enlarge non-owned data", typeid(T).name()); + AIDGE_ASSERT( + mData.empty() || mDataOwner != nullptr, + "TensorImpl_cuda<{}>: trying to enlarge non-owned data", + typeid(T).name()); mDataOwner.reset(cudaAlloc(mNbElts)); mData = future_std::span<T>(mDataOwner.get(), mNbElts); } } }; -template <typename T> -const std::string TensorImpl_cuda<T>::Backend = "cuda"; +template <typename T> const std::string TensorImpl_cuda<T>::Backend = "cuda"; namespace { -static Registrar<Tensor> registrarTensorImpl_cuda_Float64( - {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Float32( - {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Float64({"cuda", DataType::Float64}, + Aidge::TensorImpl_cuda<double>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Float32({"cuda", DataType::Float32}, + Aidge::TensorImpl_cuda<float>::create); static Registrar<Tensor> registrarTensorImpl_cuda_Float16( - {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Int32( - {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create); -} // namespace -} // namespace Aidge + {"cuda", DataType::Float16}, + Aidge::TensorImpl_cuda<half_float::half>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Int32({"cuda", DataType::Int32}, + Aidge::TensorImpl_cuda<int32_t>::create); +} // namespace +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp index 42d420f8410f79100fdfdbe3eabb8b43e616a74a..70e7f80f1e4e5489afd02473662f54a4ca34c758 100644 --- a/include/aidge/backend/cuda/operator/AddImpl.hpp +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class AddImpl_cuda : public OperatorImpl { -public: - AddImpl_cuda(const Add_Op& op) : OperatorImpl(op, "cuda") {} + public: + AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AddImpl_cuda> create(const Add_Op& op) { + static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) { return std::make_unique<AddImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cuda", Aidge::AddImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */ diff --git 
a/include/aidge/backend/cuda/operator/AndImpl.hpp b/include/aidge/backend/cuda/operator/AndImpl.hpp index e90a4c5fe3d7b4cd529dcb4cb5400a6447f53e3c..69911463ce257b804af21a00a72d9ecf2bb3c5a1 100644 --- a/include/aidge/backend/cuda/operator/AndImpl.hpp +++ b/include/aidge/backend/cuda/operator/AndImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class AndImpl_cuda : public OperatorImpl { -public: - AndImpl_cuda(const And_Op& op) : OperatorImpl(op, "cuda") {} + public: + AndImpl_cuda(const And_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AndImpl_cuda> create(const And_Op& op) { + static std::unique_ptr<AndImpl_cuda> create(const And_Op &op) { return std::make_unique<AndImpl_cuda>(op); } @@ -46,12 +46,15 @@ public: void forward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(And_Op, "cuda", Aidge::AndImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp index bae79a03d03cd5fb7d5fdc4fbebf1dd7562370ae..588581786546f1eb9442a0c095d6346e1f4e32b7 100644 --- a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp @@ -12,26 +12,26 @@ #ifndef AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> -void AndForward(const T* input1, const T* input2, T* output, - const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, - const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, +void AndForward(const T *input1, + const T *input2, + T *output, + const std::vector<int> &input1Dims, + const std::vector<int> &input2Dims, + const std::vector<int> &inputStrides, + const std::vector<int> &input2Strides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp index 7b4628084a913a10e48302597a4d5b77fb7f6d16..c20016371e7dfdeed8d09ae0ca58049d456281c6 100644 --- a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ArgMaxImpl_cuda : public OperatorImpl { -public: - ArgMaxImpl_cuda(const ArgMax_Op& op) : OperatorImpl(op, "cuda") {} + public: + ArgMaxImpl_cuda(const ArgMax_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op& op) { + static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op &op) { 
return std::make_unique<ArgMaxImpl_cuda>(op); } @@ -46,15 +46,17 @@ public: void forward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx); + template <class T> + void + forward_(const Tensor &input, std::int32_t axis, DimSize_t selectLastIdx); }; // Implementation entry point registration to Operator REGISTRAR(ArgMax_Op, "cuda", Aidge::ArgMaxImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp index 8c07bf597f6422a26cedd4176fdb1ef29bcabcef..860bb08bb6bd264b10858af432a952762c8c71cc 100644 --- a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp @@ -12,20 +12,23 @@ #ifndef AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ - template <class T> - void ArgMax_cuda_forward_kernel(const T* input, T* output, - const std::vector<int>& inputDims, const std::vector<int>& inputStrides, - int axis, int total_elems, std::size_t selectLastIdx); +namespace Aidge { +template <class T> +void ArgMax_cuda_forward_kernel(const T *input, + T *output, + const std::vector<int> &inputDims, + const std::vector<int> &inputStrides, + int axis, + int total_elems, + std::size_t selectLastIdx); } #endif /* AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp index 1c4efcf66850330fe9747c500093efa4456fa3f1..e6f9a4b8b71d15ee3b916677d029591fa26858f5 100644 --- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -28,12 +28,13 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class AvgPoolingImpl_cuda : public OperatorImpl { -public: - AvgPoolingImpl_cuda(const AvgPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class AvgPoolingImpl_cuda : public OperatorImpl { + public: + AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<DIM>& op) { + static std::unique_ptr<AvgPoolingImpl_cuda> + create(const AvgPooling_Op<DIM> &op) { return std::make_unique<AvgPoolingImpl_cuda>(op); } @@ -49,19 +50,19 @@ public: void backward() override; ~AvgPoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point 
registration to Operator using AvgPooling2D_Op = AvgPooling_Op<2>; REGISTRAR(AvgPooling2D_Op, "cuda", Aidge::AvgPoolingImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp index 025ef406fa6a988e758707b11fb2ceab6c829f26..e152653c55340712feadb50f731828b29a548020 100644 --- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp @@ -28,12 +28,13 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class BatchNormImpl_cuda : public OperatorImpl { -public: - BatchNormImpl_cuda(const BatchNorm_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class BatchNormImpl_cuda : public OperatorImpl { + public: + BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM>& op) { + static std::unique_ptr<BatchNormImpl_cuda> + create(const BatchNorm_Op<DIM> &op) { return std::make_unique<BatchNormImpl_cuda>(op); } @@ -49,19 +50,27 @@ public: void backward() override; ~BatchNormImpl_cuda(); -private: + private: // CuDNN specific variables cudnnTensorDescriptor_t mBNDesc = nullptr; cudnnBatchNormMode_t mMode; double mEpsilon; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> + void forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + const Tensor &input3, + const Tensor &input4); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2); }; // Implementation entry point registration to Operator using BatchNorm2D_Op = BatchNorm_Op<2>; REGISTRAR(BatchNorm2D_Op, "cuda", Aidge::BatchNormImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 27f3781a6824dd71d228b90c71df58b12ea0a6b3..b58352b8bcba670a5cb571d38c069c663094dedb 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -27,49 +27,53 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" - namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class ConvImpl_cuda : public OperatorImpl { -public: - ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} +template <DimIdx_t DIM> class ConvImpl_cuda : public OperatorImpl { + public: + ConvImpl_cuda(const Operator &op, bool depthWise = false) + : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} - static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM>& op) { + static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda<DIM>>(op); } - static std::unique_ptr<ConvImpl_cuda<DIM>> createDW(const ConvDepthWise_Op<DIM> &op) { + static std::unique_ptr<ConvImpl_cuda<DIM>> + createDW(const ConvDepthWise_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda<DIM>>(op, true); } virtual std::vector<ImplSpec> 
getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~ConvImpl_cuda(); -private: + private: // CuDNN specific variables cudnnConvolutionDescriptor_t mConvDesc = nullptr; cudnnFilterDescriptor_t mFilterDesc = nullptr; - cudnnConvolutionFwdAlgo_t mFwdAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + cudnnConvolutionFwdAlgo_t mFwdAlgo = + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo; cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo; size_t mWorkspaceSize = 0; - void* mFwdWorkspace = nullptr; - void* mBwdWorkspace = nullptr; + void *mFwdWorkspace = nullptr; + void *mBwdWorkspace = nullptr; std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; bool mDepthWise = false; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> + void + forward_(const Tensor &input0, const Tensor &input1, const Tensor &input2); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2); }; // Implementation entry point registration to Operator @@ -77,6 +81,6 @@ using Conv2D_Op = Conv_Op<2>; using ConvDepthWise2D_Op = ConvDepthWise_Op<2>; REGISTRAR(Conv2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::create); REGISTRAR(ConvDepthWise2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::createDW); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl.hpp b/include/aidge/backend/cuda/operator/DivImpl.hpp index fbd3c73f1741d05549f06290ba9166b8d11c604d..90686a177a0e66701a6410dac811dc25e1472341 100644 --- a/include/aidge/backend/cuda/operator/DivImpl.hpp +++ b/include/aidge/backend/cuda/operator/DivImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class DivImpl_cuda : public OperatorImpl { -public: - DivImpl_cuda(const Div_Op& op) : OperatorImpl(op, "cuda") {} + public: + DivImpl_cuda(const Div_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<DivImpl_cuda> create(const Div_Op& op) { + static std::unique_ptr<DivImpl_cuda> create(const Div_Op &op) { return std::make_unique<DivImpl_cuda>(op); } @@ -47,13 +47,16 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator REGISTRAR(Div_Op, "cuda", Aidge::DivImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp index 512bec77bb63570ffeb8f1681e4e25cd323535fa..cc5999a640eb8342bf24744c35ebf1688a63ccd9 100644 --- a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp +++ 
b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp @@ -12,28 +12,29 @@ #ifndef AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" #include "aidge/utils/Types.h" namespace Aidge { template <class T> -void divForward(const T* input1, T* output, const T* intput2, - const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, - const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, +void divForward(const T *input1, + T *output, + const T *intput2, + const std::vector<int> &input1Dims, + const std::vector<int> &input2Dims, + const std::vector<int> &outputDims, + const std::vector<int> &input1Strides, + const std::vector<int> &input2Strides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp index 8380754ea2419b2baff6de5126f8b6ff3e640178..3bd1da3560efe18018345fb2fe4586e17e80279f 100644 --- a/include/aidge/backend/cuda/operator/FCImpl.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class FCImpl_cuda : public OperatorImpl { -public: - FCImpl_cuda(const FC_Op& op) : OperatorImpl(op, "cuda") {} + public: + FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<FCImpl_cuda> create(const FC_Op& op) { + static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) { return std::make_unique<FCImpl_cuda>(op); } @@ -47,17 +47,25 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); + template <class T> + void forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels); }; // Implementation entry point registration to Operator REGISTRAR(FC_Op, "cuda", Aidge::FCImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp index a956960df0a4dccb4ef9eb0634e5f61b9ddede0a..a3ecb6adad818a11d90dfebff33710a2bb36ea60 100644 --- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp @@ -12,34 +12,45 @@ #ifndef AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> 
-#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> cublasStatus_t cublasGemm(cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, const T *alpha, - const T *A, int lda, - const T *B, int ldb, + const T *A, + int lda, + const T *B, + int ldb, const T *beta, - T *C, int ldc); + T *C, + int ldc); template <class T> -cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, - int m, int n, - const T *alpha, - const T *A, int lda, - const T *x, int incx, +cublasStatus_t cublasGemv(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const T *alpha, + const T *A, + int lda, + const T *x, + int incx, const T *beta, - T *y, int incy); -} + T *y, + int incy); +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp index 5b0cf07ab8687b9746d13af2274465ad923e6571..65c97376dae44fc346dd98a0e2b0fc6f85378682 100644 --- a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp @@ -29,35 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class GlobalAveragePoolingImpl_cuda : public OperatorImpl { -public: - GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op& op) : OperatorImpl(op, "cuda") {} + public: + GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op& op) { + static std::unique_ptr<GlobalAveragePoolingImpl_cuda> + create(const GlobalAveragePooling_Op &op) { return std::make_unique<GlobalAveragePoolingImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~GlobalAveragePoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator -REGISTRAR(GlobalAveragePooling_Op, "cuda", Aidge::GlobalAveragePoolingImpl_cuda::create); -} // namespace Aidge +REGISTRAR(GlobalAveragePooling_Op, + "cuda", + Aidge::GlobalAveragePoolingImpl_cuda::create); +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp index 0d858c4719899094f996ca4f82f075df547a6fd4..01e0dad76522fc55c638a6c51c29d46297601bb4 100644 --- a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp @@ -30,10 +30,11 @@ namespace Aidge { class ILayerNormImpl_cuda : public OperatorImpl { -public: + public: 
ILayerNormImpl_cuda(const ILayerNorm_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ILayerNormImpl_cuda> create(const ILayerNorm_Op &op) { + static std::unique_ptr<ILayerNormImpl_cuda> + create(const ILayerNorm_Op &op) { return std::make_unique<ILayerNormImpl_cuda>(op); } @@ -48,18 +49,20 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); - template <class T> void backward_(const Tensor& output_grad); + template <class T> + void + forward_(const Tensor &input0, const Tensor &input1, const Tensor &input2); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ILayerNorm_Op, "cuda", Aidge::ILayerNormImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp index aa54029ea29bc46809f227038a1a23d91bc161ee..c9269380544babc7de7c50065e275155d9bafd8e 100644 --- a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp @@ -14,79 +14,113 @@ #ifndef AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ILayerNorm - * @param input: Input tensor - * @param SF: Scaling factor of input tensor - * @param dims: Dimensions of input tensor - * @param quantized_tensor: Quantized output tensor - * @param square_tensor: Tensor use for computation - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param new_SF: Scaling factor of output that can be use to dequantify -*/ + * @brief Compute the forward for ILayerNorm + * @param input: Input tensor + * @param SF: Scaling factor of input tensor + * @param dims: Dimensions of input tensor + * @param quantized_tensor: Quantized output tensor + * @param square_tensor: Tensor use for computation + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param new_SF: Scaling factor of output that can be use to dequantify + */ template <class T> -__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor,long long int* square_tensor, T* weight, T* biase, double new_SF); +__global__ void ILayerNormforward_(T *input, + double SF, + int *dims, + int *quantized_tensor, + long long int *square_tensor, + T *weight, + T *biase, + double new_SF); /** - * @brief Wrapper function to execute ILayerNormforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param input: Input tensor - * @param output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param weight_raw: weight 
of ILayerNorm layer - * @param bias_raw: bias of ILayerNorm layer - * @param size: Number of elements in the input tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ILayerNormforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param weight_raw: weight of ILayerNorm layer + * @param bias_raw: bias of ILayerNorm layer + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor + */ template <class T> -void ILayerNormforward(const T* input, T* output, double SF, const T* weight_raw, const T* bias_raw, size_t size, std::vector<long unsigned int> dims_input); +void ILayerNormforward(const T *input, + T *output, + double SF, + const T *weight_raw, + const T *bias_raw, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ILayerNorm - * @param output_grad: Gradient of output tensor - * @param input_tensor: Input tensor - * @param output_tensor: Output tensor obtained after forward - * @param mean: Arithmetic mean of input tensor - * @param var: Arithmetic variance of input tensor - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param input_grad: Gradient of input tensor - * @param weight_grad: Gradient of ILayerNorm weight - * @param bias_grad: Gradient of ILayerNorm bias - * @param size: Number of elements in the input tensor -*/ + * @brief Compute the backward for ILayerNorm + * @param output_grad: Gradient of output tensor + * @param input_tensor: Input tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor + */ template <class T> -__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size); +__global__ void ILayerNormbackward_(T *output_grad, + T *input_tensor, + T *output_tensor, + T *mean, + T *var, + T *weight, + T *bias, + T *input_grad, + T *weight_grad, + T *bias_grad, + int size); /** - * @brief Wrapper function to execute ILayerNormbackward_ - * @param input_tensor: Input tensor - * @param output_grad: Gradient of output tensor - * @param output_tensor: Output tensor obtained after forward - * @param mean: Arithmetic mean of input tensor - * @param var: Arithmetic variance of input tensor - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param input_grad: Gradient of input tensor - * @param weight_grad: Gradient of ILayerNorm weight - * @param bias_grad: Gradient of ILayerNorm bias - * @param size: Number of elements in the input tensor -*/ + * @brief Wrapper function to execute ILayerNormbackward_ + * @param input_tensor: Input tensor + * @param output_grad: Gradient of output tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: 
Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor + */ template <class T> -void ILayerNormbackward(const T* input_tensor, const T* output_grad, const T* output_tensor,const T* mean,const T* var, const T* weight, const T* bias, T* input_grad, T* weight_grad, T* bias_grad, size_t size); +void ILayerNormbackward(const T *input_tensor, + const T *output_grad, + const T *output_tensor, + const T *mean, + const T *var, + const T *weight, + const T *bias, + T *input_grad, + T *weight_grad, + T *bias_grad, + size_t size); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/LnImpl.hpp b/include/aidge/backend/cuda/operator/LnImpl.hpp index fbbccc11275b5c11bbaa86d05a2c19a1a46c11c1..a72ddb0dbd7bc37202bf96fbad9681bfc11f5a72 100644 --- a/include/aidge/backend/cuda/operator/LnImpl.hpp +++ b/include/aidge/backend/cuda/operator/LnImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class LnImpl_cuda : public OperatorImpl { -public: - LnImpl_cuda(const Ln_Op& op) : OperatorImpl(op, "cuda") {} + public: + LnImpl_cuda(const Ln_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<LnImpl_cuda> create(const Ln_Op& op) { + static std::unique_ptr<LnImpl_cuda> create(const Ln_Op &op) { return std::make_unique<LnImpl_cuda>(op); } @@ -47,16 +47,16 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Ln_Op, "cuda", Aidge::LnImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp index 9652d88116ca2cac92abbc517f8bc650655f43cc..305a201edd3cb9169630cf554a005715df3c4970 100644 --- a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp @@ -12,25 +12,19 @@ #ifndef AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" #include "aidge/utils/Types.h" namespace Aidge { -template <class T> -void lnForward(const T* input, T* output, int size); +template <class T> void lnForward(const T *input, T *output, int size); } #endif /* AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp index 474a408f9697e8e91ffe9c8e2a79a79d7968e80a..44e09d0b8d5ed94c698d3774db2ac320bfd1fca8 100644 --- 
a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -28,38 +28,37 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class MaxPoolingImpl_cuda : public OperatorImpl { -public: - MaxPoolingImpl_cuda(const MaxPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class MaxPoolingImpl_cuda : public OperatorImpl { + public: + MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<DIM>& op) { + static std::unique_ptr<MaxPoolingImpl_cuda> + create(const MaxPooling_Op<DIM> &op) { return std::make_unique<MaxPoolingImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~MaxPoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator using MaxPooling2D_Op = MaxPooling_Op<2>; REGISTRAR(MaxPooling2D_Op, "cuda", Aidge::MaxPoolingImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp index 9a1a4d79d32c7a962d2086319d948e60a9f51049..d4995a1a7ecbdf8c6777769fe68f9e5c851e917c 100644 --- a/include/aidge/backend/cuda/operator/MulImpl.hpp +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class MulImpl_cuda : public OperatorImpl { -public: - MulImpl_cuda(const Mul_Op& op) : OperatorImpl(op, "cuda") {} + public: + MulImpl_cuda(const Mul_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<MulImpl_cuda> create(const Mul_Op& op) { + static std::unique_ptr<MulImpl_cuda> create(const Mul_Op &op) { return std::make_unique<MulImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Mul_Op, "cuda", Aidge::MulImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp 
b/include/aidge/backend/cuda/operator/PadImpl.hpp index a0f7037c811cd3cb130cffed0bb7746e33220074..14482b2c78780100e4b301235834665ed92e7441 100644 --- a/include/aidge/backend/cuda/operator/PadImpl.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl.hpp @@ -28,12 +28,11 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class PadImpl_cuda : public OperatorImpl { -public: - PadImpl_cuda(const Pad_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class PadImpl_cuda : public OperatorImpl { + public: + PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM>& op) { + static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM> &op) { return std::make_unique<PadImpl_cuda>(op); } @@ -48,20 +47,20 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; int mLeftPad, mTopPad; double mPadVal; unsigned int mPadType; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& outGrad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator using Pad2D_Op = Pad_Op<2>; REGISTRAR(Pad2D_Op, "cuda", Aidge::PadImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp index 11ddb0ea8b0e6603bf009c4ae0a7fa3247a8904f..5924a65d43b15946f3adeeea4ad9992fcbe90e8e 100644 --- a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp @@ -12,26 +12,25 @@ #ifndef AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ +namespace Aidge { - template <class T> - void cudaPadding(const cudaDeviceProp &deviceProp, - unsigned int nbOutputs, - unsigned int outputsWidth, - unsigned int outputsHeight, - unsigned int nbChannels, - unsigned int batchSize, - unsigned int inputWidth, - unsigned int inputHeight, - int leftPad, - int topPad, - unsigned int padType, - T padValue, - const T *input, - T *outputs); +template <class T> +void cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + T padValue, + const T *input, + T *outputs); } #endif /* AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/PowImpl.hpp b/include/aidge/backend/cuda/operator/PowImpl.hpp index 9b53d8dc04985794238f79cff9c78c44408fb6d7..5a8b31978bc90de4334ec3adb8ef373d6a4720bd 100644 --- a/include/aidge/backend/cuda/operator/PowImpl.hpp +++ b/include/aidge/backend/cuda/operator/PowImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class PowImpl_cuda : public OperatorImpl { -public: - PowImpl_cuda(const Pow_Op& op) : OperatorImpl(op, "cuda") {} + public: + 
PowImpl_cuda(const Pow_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<PowImpl_cuda> create(const Pow_Op& op) { + static std::unique_ptr<PowImpl_cuda> create(const Pow_Op &op) { return std::make_unique<PowImpl_cuda>(op); } @@ -47,13 +47,16 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator REGISTRAR(Pow_Op, "cuda", Aidge::PowImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp index e89bea53ba766b0bd90f0c7acd631b0370d96298..84de2e6b9758cd2b49fc29bf28e06092d166726d 100644 --- a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp @@ -12,27 +12,28 @@ #ifndef AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> -void powForward(const T* input, T* output, const T* exponent, - const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, - const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, +void powForward(const T *input, + T *output, + const T *exponent, + const std::vector<int> &inputDims, + const std::vector<int> &exponentDims, + const std::vector<int> &outputDims, + const std::vector<int> &inputStrides, + const std::vector<int> &exponentStrides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp index 306a56c4d0959dc4d818a6791173c375f5435360..7f8aaaf1b6e96a0b8708b00ddb458058e8ca88b3 100644 --- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class ReLUImpl_cuda : public OperatorImpl { -public: - ReLUImpl_cuda(const ReLU_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op& op) { + static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) { return std::make_unique<ReLUImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~ReLUImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - 
cudnnActivationDescriptor_t mReLUDesc = nullptr; - #else - cudnnActivationMode_t mReLUDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mReLUDesc = nullptr; +#else + cudnnActivationMode_t mReLUDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ReLU_Op, "cuda", Aidge::ReLUImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp index 9d352b8b1d14aeaa4230accd7aa81c279c18b7a8..385f274843490267ddb753c261eb6e60c4dd95c3 100644 --- a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp @@ -12,19 +12,18 @@ #ifndef AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ +namespace Aidge { - template <class T> - void ReduceBackward(const T* input, - T* output, - const std::vector<std::size_t>& inputDims, - const std::vector<std::size_t>& outputDims, - const std::vector<int>& axes, - const std::vector<std::size_t>& factors, - int outSize); +template <class T> +void ReduceBackward(const T *input, + T *output, + const std::vector<std::size_t> &inputDims, + const std::vector<std::size_t> &outputDims, + const std::vector<int> &axes, + const std::vector<std::size_t> &factors, + int outSize); } #endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp index 1f6878480d69e19f8c73a12862cc12b2d675440d..3eda27795289871bbe82414c6ea5e618b1608060 100644 --- a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp @@ -29,10 +29,11 @@ namespace Aidge { // Operator implementation entry point for the backend class ReduceMeanImpl_cuda : public OperatorImpl { -public: - ReduceMeanImpl_cuda(const ReduceMean_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReduceMeanImpl_cuda(const ReduceMean_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReduceMeanImpl_cuda> create(const ReduceMean_Op& op) { + static std::unique_ptr<ReduceMeanImpl_cuda> + create(const ReduceMean_Op &op) { return std::make_unique<ReduceMeanImpl_cuda>(op); } @@ -47,16 +48,19 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); - template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); + template <class T> + void + forward_(const Tensor &input, const std::vector<int> &axes, bool keepDims); + template <class T> + void backward_(const Tensor &output_grad, const std::vector<int> &axes); }; // Implementation 
entry point registration to Operator REGISTRAR(ReduceMean_Op, "cuda", Aidge::ReduceMeanImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp index 10af90ba3a4ffc1d1464dd73f15313315b0c0032..16538964b7dced5082e41fd0416593af98694e13 100644 --- a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ReduceSumImpl_cuda : public OperatorImpl { -public: - ReduceSumImpl_cuda(const ReduceSum_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReduceSumImpl_cuda(const ReduceSum_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op& op) { + static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op &op) { return std::make_unique<ReduceSumImpl_cuda>(op); } @@ -47,16 +47,19 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); - template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); + template <class T> + void + forward_(const Tensor &input, const std::vector<int> &axes, bool keepDims); + template <class T> + void backward_(const Tensor &output_grad, const std::vector<int> &axes); }; // Implementation entry point registration to Operator REGISTRAR(ReduceSum_Op, "cuda", Aidge::ReduceSumImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp index 2c8ebd68cff0313031279f83109043eb17d919b5..e8e231b9d66bf8016dee86a22d73282c3dc7ea67 100644 --- a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ReshapeImpl_cuda : public OperatorImpl { -public: - ReshapeImpl_cuda(const Reshape_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op& op) { + static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) { return std::make_unique<ReshapeImpl_cuda>(op); } @@ -47,12 +47,12 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; }; // Implementation entry point registration to Operator REGISTRAR(Reshape_Op, "cuda", Aidge::ReshapeImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp index 1eff6dfbb1777d8dbd823d7bc9b94894bb2646b9..c501d0f3ef9ad6249fd15f90f67a51cb2dae6fed 100644 --- a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp @@ -30,7 +30,7 @@ namespace Aidge { class ShiftGELUImpl_cuda : public OperatorImpl { -public: + public: 
ShiftGELUImpl_cuda(const ShiftGELU_Op &op) : OperatorImpl(op, "cuda") {} static std::unique_ptr<ShiftGELUImpl_cuda> create(const ShiftGELU_Op &op) { @@ -45,21 +45,19 @@ public: }; } - void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); - + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ShiftGELU_Op, "cuda", Aidge::ShiftGELUImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp index 14268521451a631ccb9194d44ed7543af8d494f5..4ac4cd96fcdc59901282447d3e6e729988ee4d5e 100644 --- a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp @@ -14,65 +14,91 @@ #ifndef AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ShiftGELU - * @param input: Input tensor - * @param quantized_tensor: Quantized output tensor - * @param GELUtensor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param SumTensor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param dims: Dimensions of input tensor - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) -*/ + * @brief Compute the forward for ShiftGELU + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param GELUtensor: Pointer to an empty memory block allocated on the GPU + * (just use for computation) + * @param SumTensor: Pointer to an empty memory block allocated on the GPU + * (just use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + */ template <class T> -__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits); +__global__ void ShiftGELUforward_(T *input, + int *quantized_tensor, + int *GELUtensor, + int *SumTensor, + int *dims, + double SF, + int N, + int output_bits); /** - * @brief Wrapper function to execute ShiftGELUforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param 
input: Input tensor - * @param output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param size: Number of elements in the input tensor - * @param dims_input: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftGELUforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor + */ template <class T> -void ShiftGELUforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); +void ShiftGELUforward(const T *input, + T *output, + double SF, + int N, + int output_bits, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ShiftGELU - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param size: Number of elements in the input tensor -*/ + * @brief Compute the backward for ShiftGELU + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param size: Number of elements in the input tensor + */ template <class T> -__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size); +__global__ void ShiftGELUbackward_(T *input_grad, + const T *output_tensor, + const T *output_grad, + int size); /** - * @brief Wrapper function to execute ShiftGELUbackward_ - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param size: Number of elements in the input tensor -*/ + * @brief Wrapper function to execute ShiftGELUbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor + */ template <class T> -void ShiftGELUbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size); +void ShiftGELUbackward(const T *output_tensor, + const T *output_grad, + T *input_grad, + size_t size); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp index 3e6e3744cb544d0928a9229aa5110cf776f0c507..b21182e821934f29a69bc95c72a97507d7daf901 100644 --- a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp 
@@ -30,7 +30,7 @@ namespace Aidge { class ShiftMaxImpl_cuda : public OperatorImpl { -public: + public: ShiftMaxImpl_cuda(const ShiftMax_Op &op) : OperatorImpl(op, "cuda") {} static std::unique_ptr<ShiftMaxImpl_cuda> create(const ShiftMax_Op &op) { @@ -48,17 +48,16 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); - + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ShiftMax_Op, "cuda", Aidge::ShiftMaxImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp index 037a7cbb6362a8eca5a9e6f5a277b29a6a6bd907..5ed878f3f350ac2746c937b2054f97fc14479825 100644 --- a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp @@ -14,66 +14,92 @@ #ifndef AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ShiftMax - * @param input: Input tensor - * @param quantized_tensor: Quantized output tensor - * @param factor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param dims: Dimensions of input tensor - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param new_SF: Scaling factor of output that can be use to dequantify -*/ + * @brief Compute the forward for ShiftMax + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param factor: Pointer to an empty memory block allocated on the GPU (just + * use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param new_SF: Scaling factor of output that can be use to dequantify + */ template <class T> -__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF); +__global__ void ShiftMaxforward_(T *input, + int *quantized_tensor, + int *factor, + int *dims, + double SF, + int N, + int output_bits, + double new_SF); /** - * @brief Wrapper function to execute ShiftMaxforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param input: Input tensor - * @param 
output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param size: Number of elements in the input tensor - * @param dims_input: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftMaxforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor + */ template <class T> -void ShiftMaxforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); +void ShiftMaxforward(const T *input, + T *output, + double SF, + int N, + int output_bits, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ShiftMax - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Compute the backward for ShiftMax + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param dims: Dimensions of input tensor + */ template <class T> -__global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T* output_grad, const int* dims); +__global__ void ShiftMaxbackward_(T *input_grad, + const T *output_tensor, + const T *output_grad, + const int *dims); /** - * @brief Wrapper function to execute ShiftMaxbackward_ - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param size: Number of elements in the input tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftMaxbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor + */ template <class T> -void ShiftMaxbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size, std::vector<long unsigned int> dims); +void ShiftMaxbackward(const T *output_tensor, + const T *output_grad, + T *input_grad, + size_t size, + std::vector<long unsigned int> dims); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp index dc1434c8ecc8568bd4f82c7c7ce5db78cc1885a9..2437d5cf58a68c6282d21aaf1b3b7868c57c3221 100644 --- 
a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp +++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class SigmoidImpl_cuda : public OperatorImpl { -public: - SigmoidImpl_cuda(const Sigmoid_Op& op) : OperatorImpl(op, "cuda") {} + public: + SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op& op) { + static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) { return std::make_unique<SigmoidImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~SigmoidImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - cudnnActivationDescriptor_t mSigmoidDesc = nullptr; - #else - cudnnActivationMode_t mSigmoidDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mSigmoidDesc = nullptr; +#else + cudnnActivationMode_t mSigmoidDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Sigmoid_Op, "cuda", Aidge::SigmoidImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp index 529d0b2b2dd4a0ec8a3dae5bf0219f8a4f2968c6..973bd1a1f0322bfcc8267af569b35f392ab71deb 100644 --- a/include/aidge/backend/cuda/operator/SubImpl.hpp +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class SubImpl_cuda : public OperatorImpl { -public: - SubImpl_cuda(const Sub_Op& op) : OperatorImpl(op, "cuda") {} + public: + SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<SubImpl_cuda> create(const Sub_Op& op) { + static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) { return std::make_unique<SubImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cuda", Aidge::SubImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */ diff --git 
a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp index a87d7bd8c318149cb625a3cf0122f7eac1ea6149..c83061fc2b79152bc49d009bfbc4a271f9b52b9b 100644 --- a/include/aidge/backend/cuda/operator/TanhImpl.hpp +++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class TanhImpl_cuda : public OperatorImpl { -public: - TanhImpl_cuda(const Tanh_Op& op) : OperatorImpl(op, "cuda") {} + public: + TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op& op) { + static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) { return std::make_unique<TanhImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~TanhImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - cudnnActivationDescriptor_t mTanhDesc = nullptr; - #else - cudnnActivationMode_t mTanhDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mTanhDesc = nullptr; +#else + cudnnActivationMode_t mTanhDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Tanh_Op, "cuda", Aidge::TanhImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */ diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp index f21886e502b9017aa55e250e7257d16bc5d04501..1d66f4eab386ea55bcda1455de279fabaf68b147 100644 --- a/include/aidge/backend/cuda/utils/CudaContext.hpp +++ b/include/aidge/backend/cuda/utils/CudaContext.hpp @@ -3,19 +3,18 @@ #include <vector> -#include "aidge/utils/ErrorHandling.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/ErrorHandling.hpp" namespace Aidge { class CudaContext { -public: - static int nbDevice(){ + public: + static int nbDevice() { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); return count; } - static void setDevice(int device = -1) - { + static void setDevice(int device = -1) { static int prevDevice = 0; if (device >= 0) @@ -26,27 +25,25 @@ public: CHECK_CUDA_STATUS(cudaSetDevice(device)); } - static std::pair<size_t, size_t> getMemInfo(){ + static std::pair<size_t, size_t> getMemInfo() { size_t free; size_t total; - CHECK_CUDA_STATUS(cudaMemGetInfo (&free, &total)); + CHECK_CUDA_STATUS(cudaMemGetInfo(&free, &total)); return std::make_pair(free, total); } - - static int getDevice(){ + static int getDevice() { int dev; CHECK_CUDA_STATUS(cudaGetDevice(&dev)); return dev; } - static const cudaDeviceProp& getDeviceProp() - { + static const cudaDeviceProp &getDeviceProp() { static std::vector<cudaDeviceProp> deviceProp; static std::vector<bool> init; if (deviceProp.empty()) { -//#pragma omp critical(CudaContext__getDeviceProp) + // #pragma omp critical(CudaContext__getDeviceProp) if (deviceProp.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -68,12 
+65,11 @@ public: } // Declare cublas handle - static cublasHandle_t& cublasHandle() - { + static cublasHandle_t &cublasHandle() { static std::vector<cublasHandle_t> cublas_h; if (cublas_h.empty()) { -//#pragma omp critical(CudaContext__cublasHandle) + // #pragma omp critical(CudaContext__cublasHandle) if (cublas_h.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -94,12 +90,11 @@ public: } // Declare cudnn handle - static cudnnHandle_t& cudnnHandle() - { + static cudnnHandle_t &cudnnHandle() { static std::vector<cudnnHandle_t> cudnn_h; if (cudnn_h.empty()) { -//#pragma omp critical(CudaContext__cudnnHandle) + // #pragma omp critical(CudaContext__cudnnHandle) if (cudnn_h.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -119,54 +114,50 @@ public: return cudnn_h[dev]; } - template <class T> - struct data_type { + template <class T> struct data_type { static const cudnnDataType_t value = CUDNN_DATA_FLOAT; - // Dummy value by default + // Dummy value by default }; }; -} +} // namespace Aidge namespace Aidge { - template <> - struct CudaContext::data_type<half_float::half> { - static const cudnnDataType_t value = CUDNN_DATA_HALF; - }; +template <> struct CudaContext::data_type<half_float::half> { + static const cudnnDataType_t value = CUDNN_DATA_HALF; +}; - template <> - struct CudaContext::data_type<float> { - static const cudnnDataType_t value = CUDNN_DATA_FLOAT; - }; +template <> struct CudaContext::data_type<float> { + static const cudnnDataType_t value = CUDNN_DATA_FLOAT; +}; - template <> - struct CudaContext::data_type<double> { - static const cudnnDataType_t value = CUDNN_DATA_DOUBLE; - }; +template <> struct CudaContext::data_type<double> { + static const cudnnDataType_t value = CUDNN_DATA_DOUBLE; +}; - inline cudnnDataType_t DataTypeToCudnn(DataType type) { - switch (type) { - case DataType::Float64: - return CUDNN_DATA_DOUBLE; - case DataType::Float32: - return CUDNN_DATA_FLOAT; - case DataType::Float16: - return CUDNN_DATA_HALF; - case DataType::Int8: - return CUDNN_DATA_INT8; - case DataType::UInt8: - return CUDNN_DATA_UINT8; - case DataType::Int32: - return CUDNN_DATA_INT32; +inline cudnnDataType_t DataTypeToCudnn(DataType type) { + switch (type) { + case DataType::Float64: + return CUDNN_DATA_DOUBLE; + case DataType::Float32: + return CUDNN_DATA_FLOAT; + case DataType::Float16: + return CUDNN_DATA_HALF; + case DataType::Int8: + return CUDNN_DATA_INT8; + case DataType::UInt8: + return CUDNN_DATA_UINT8; + case DataType::Int32: + return CUDNN_DATA_INT32; #if CUDNN_VERSION >= 8100 - case DataType::Int64: - return CUDNN_DATA_INT64; + case DataType::Int64: + return CUDNN_DATA_INT64; #endif - default: - assert(false && "Unsupported CuDNN type"); - } - - return CUDNN_DATA_FLOAT; // TODO: undefined behavior + default: + assert(false && "Unsupported CuDNN type"); } + + return CUDNN_DATA_FLOAT; // TODO: undefined behavior } +} // namespace Aidge #endif // AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H diff --git a/include/aidge/backend/cuda/utils/CudaUtils.hpp b/include/aidge/backend/cuda/utils/CudaUtils.hpp index ab7c805224ed6fe073baf2036b84f4ed6f49b077..1601dd0c0944992fa0a978dcf38cd4bcb6c9771c 100644 --- a/include/aidge/backend/cuda/utils/CudaUtils.hpp +++ b/include/aidge/backend/cuda/utils/CudaUtils.hpp @@ -1,11 +1,11 @@ #ifndef AIDGE_BACKEND_CUDA_CUDA_UTILS_H #define AIDGE_BACKEND_CUDA_CUDA_UTILS_H -#include <string> +#include <iostream> #include <memory> #include <sstream> -#include <iostream> #include <stdexcept> +#include <string> #include 
<cublas_v2.h> #include <cuda.h> @@ -14,86 +14,85 @@ #include "aidge/data/half.hpp" #include "aidge/utils/ErrorHandling.hpp" -#define CHECK_CUDNN_STATUS(status) \ - do { \ - const cudnnStatus_t e = (status); \ - if (e != CUDNN_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "CUDNN failure: " << cudnnGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUDNN_STATUS(status) \ + do { \ + const cudnnStatus_t e = (status); \ + if (e != CUDNN_STATUS_SUCCESS) { \ + std::stringstream error; \ + error << "CUDNN failure: " << cudnnGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) -#define CHECK_CUDA_STATUS(status) \ - do { \ - const cudaError_t e = (status); \ - if ((e) != cudaSuccess) { \ - std::stringstream error; \ - error << "Cuda failure: " << cudaGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUDA_STATUS(status) \ + do { \ + const cudaError_t e = (status); \ + if ((e) != cudaSuccess) { \ + std::stringstream error; \ + error << "Cuda failure: " << cudaGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) -#define CHECK_CUBLAS_STATUS(status) \ - do { \ - const cublasStatus_t e = (status); \ - if (e != CUBLAS_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "Cublas failure: " \ - << Aidge::Cuda::cublasGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUBLAS_STATUS(status) \ + do { \ + const cublasStatus_t e = (status); \ + if (e != CUBLAS_STATUS_SUCCESS) { \ + std::stringstream error; \ + error << "Cublas failure: " \ + << Aidge::Cuda::cublasGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) namespace Aidge { namespace Cuda { - // CuDNN scaling parameters are typically "alpha" and "beta". 
- // Their type must be "float" for HALF and FLOAT (default template) - // and "double" for DOUBLE (specialized template) - template <class T> - struct cudnn_scaling_type { - typedef float type; - }; +// CuDNN scaling parameters are typically "alpha" and "beta". +// Their type must be "float" for HALF and FLOAT (default template) +// and "double" for DOUBLE (specialized template) +template <class T> struct cudnn_scaling_type { + typedef float type; +}; - template <> - struct cudnn_scaling_type<double> { - typedef double type; - }; +template <> struct cudnn_scaling_type<double> { + typedef double type; +}; - template <class T> - struct cuda_type { - typedef T type; - }; +template <class T> struct cuda_type { + typedef T type; +}; - template <> - struct cuda_type<half_float::half> { - typedef __half type; - }; +template <> struct cuda_type<half_float::half> { + typedef __half type; +}; - const char* cublasGetErrorString(cublasStatus_t error); +const char *cublasGetErrorString(cublasStatus_t error); - // Enable Peer-to-Peer communications between devices - // when it is possible - void setMultiDevicePeerAccess(unsigned int size, unsigned int* devices); -} -} +// Enable Peer-to-Peer communications between devices +// when it is possible +void setMultiDevicePeerAccess(unsigned int size, unsigned int *devices); +} // namespace Cuda +} // namespace Aidge #endif // AIDGE_BACKEND_CUDA_CUDA_UTILS_H diff --git a/include/aidge/utils/sys_info/CudaVersionInfo.hpp b/include/aidge/utils/sys_info/CudaVersionInfo.hpp index 17490476b18d62da66671a28f76709349e3ba805..7e8e8dbbb42cff84cbb28a9310d92643998ad508 100644 --- a/include/aidge/utils/sys_info/CudaVersionInfo.hpp +++ b/include/aidge/utils/sys_info/CudaVersionInfo.hpp @@ -1,7 +1,7 @@ #ifndef AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H #define AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H -#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDA_STATUS +#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDA_STATUS #include "aidge/utils/Log.hpp" namespace Aidge { @@ -16,9 +16,15 @@ namespace Aidge { #define CUDA_COMPILER_VERSION "Unknown version" #endif void showCudaVersion() { - Log::info("Aidge backend CUDA: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__); + Log::info("Aidge backend CUDA: {} ({}), {} {}", + PROJECT_VERSION, + GIT_COMMIT_HASH, + __DATE__, + __TIME__); Log::info("CUDA compiler version: {}", CUDA_COMPILER_VERSION); - Log::info("CuDNN version: {}.{}.{}\n", CUDNN_MAJOR, CUDNN_MINOR, + Log::info("CuDNN version: {}.{}.{}\n", + CUDNN_MAJOR, + CUDNN_MINOR, CUDNN_PATCHLEVEL); int deviceCount = 0; @@ -43,11 +49,14 @@ void showCudaVersion() { cudaRuntimeGetVersion(&runtimeVersion); Log::info( "\tCUDA Driver Version / Runtime Version: {}.{} / {}.{}", - (driverVersion / 1000), ((driverVersion % 100) / 10), - (runtimeVersion / 1000), ((runtimeVersion % 100) / 10)); + (driverVersion / 1000), + ((driverVersion % 100) / 10), + (runtimeVersion / 1000), + ((runtimeVersion % 100) / 10)); Log::info("\tCUDA Capability Major/Minor version number: {}.{}", - deviceProp.major, deviceProp.minor); + deviceProp.major, + deviceProp.minor); } } -} // namespace Aidge -#endif // AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H +} // namespace Aidge +#endif // AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H diff --git a/python_binding/pybind_backend_cuda.cpp b/python_binding/pybind_backend_cuda.cpp index 3d7564459781d6933827aa66b405b03085806467..dcbc785a2c8b0974ebb375f885694030dc566fd4 100644 --- a/python_binding/pybind_backend_cuda.cpp +++ 
b/python_binding/pybind_backend_cuda.cpp @@ -17,9 +17,9 @@ namespace py = pybind11; namespace Aidge { -void init_cuda_sys_info(py::module& m); +void init_cuda_sys_info(py::module &m); -void init_Aidge(py::module& m){ +void init_Aidge(py::module &m) { init_cuda_sys_info(m); } diff --git a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp index 64f650903ec75d579ffd58dbd6d7db7bbaf573a2..5e6f2db9868de3f1bf374f25f1f4574846ab0c45 100644 --- a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp +++ b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp @@ -1,9 +1,9 @@ -#include <pybind11/pybind11.h> #include "aidge/utils/sys_info/CudaVersionInfo.hpp" +#include <pybind11/pybind11.h> namespace py = pybind11; namespace Aidge { -void init_cuda_sys_info(py::module& m){ +void init_cuda_sys_info(py::module &m) { m.def("show_cuda_version", &showCudaVersion); } -} +} // namespace Aidge diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index de7ea925554906ea5fe1e5dcba268b17a06a47bd..9ed7874a8ef1ecfc228720e054b35de903030a57 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -22,27 +22,39 @@ #include "aidge/utils/Types.h" void Aidge::AddImpl_cuda::forward() { - const Add_Op& op = static_cast<const Add_Op&>(mOp); + const Add_Op &op = static_cast<const Add_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Add forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Add forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot add inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -58,76 +70,106 @@ void Aidge::AddImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - 
case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AddImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AddImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - // Add first input CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); + // Add first input + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), + &alpha, + tensorDesc, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); // Add other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) - { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); + for (size_t i = 1; i < op.nbInputs(); ++i) { + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[i].getImpl()->rawPtr(), - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + &alpha, + tensorDesc, + inputs[i].getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + 
->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::AddImpl_cuda::backward() { - const Add_Op& op = static_cast<const Add_Op&>(mOp); + const Add_Op &op = static_cast<const Add_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Add operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Add backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Add operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Add backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); @@ -139,77 +181,89 @@ void Aidge::AddImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AddImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type 
alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - for (std::size_t i = 0; i < inputsDims.size(); i++) - { - if (op.getInput(i)->size() == op.getOutput(0)->size()) - { + for (std::size_t i = 0; i < inputsDims.size(); i++) { + if (op.getInput(i)->size() == op.getOutput(0)->size()) { // TODO: Test if we can avoid copy and simply set rawPtr - op.getInput(i)->grad()->getImpl()->copy(outputGrad.getImpl()->rawPtr(), op.getInput(i)->grad()->size()); - } - else // In case of broadcasting + op.getInput(i)->grad()->getImpl()->copy( + outputGrad.getImpl()->rawPtr(), + op.getInput(i)->grad()->size()); + } else // In case of broadcasting { - // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor + // Gradient with respect to input_i: sum outputGrad over the + // broadcasted dimensions using cudnnReduceTensor cudnnReduceTensorDescriptor_t reduceDesc; CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor( + reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + cudnnTensorDescriptor_t outputDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>( + outputGrad.getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS( + cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + outputDesc, + tensorDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - outputDesc, - outputGrad.getImpl()->rawPtr(), - &beta, - tensorDesc, - op.getInput(i)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + outputDesc, + outputGrad.getImpl()->rawPtr(), + &beta, + tensorDesc, + op.getInput(i)->grad()->getImpl()->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp index e1ee9ebcb9437b89666da21a915907b5434ece26..ddbf69ab48d23c10a970bc30c807c6639b54cef4 100644 --- a/src/operator/AndImpl.cpp +++ b/src/operator/AndImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::AndImpl_cuda::forward() { - const And_Op& op = static_cast<const 
And_Op&>(mOp); + const And_Op &op = static_cast<const And_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in And operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run And forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in And operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run And forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot And inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run And forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot And inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,37 +71,48 @@ void Aidge::AndImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AndImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); +void Aidge::AndImpl_cuda::forward_( + const std::vector<Tensor> 
&inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - Aidge::AndForward<T>(input1Ptr, input2Ptr, outputPtr, - inputsDims[0], inputsDims[1], - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::AndForward<T>(input1Ptr, + input2Ptr, + outputPtr, + inputsDims[0], + inputsDims[1], + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } \ No newline at end of file diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp index 50d00592ca70333d6fbdd7a10761a0ea2e9beb4b..2820ba52ad550a7c00e7e0c382ef4a17bc0fda2c 100644 --- a/src/operator/ArgMaxImpl.cpp +++ b/src/operator/ArgMaxImpl.cpp @@ -23,52 +23,66 @@ #include "aidge/utils/Types.h" void Aidge::ArgMaxImpl_cuda::forward() { - const ArgMax_Op& op = dynamic_cast<const ArgMax_Op&>(mOp); + const ArgMax_Op &op = dynamic_cast<const ArgMax_Op &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input in ArgMax operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ArgMax forward because the input has no implementation."); + AIDGE_ASSERT( + op.getInput(0)->hasImpl(), + "cannot run ArgMax forward because the input has no implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); const std::int32_t axis = op.axis(); const DimSize_t selectLastIdx = op.selectLastIndex(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axis, selectLastIdx); - break; - case DataType::Float32: - forward_<float>(input, axis, selectLastIdx); - break; - case DataType::Float16: - forward_<half>(input, axis, selectLastIdx); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input, axis, selectLastIdx); + break; + case DataType::Float32: + forward_<float>(input, axis, selectLastIdx); + break; + case DataType::Float16: + forward_<half>(input, axis, selectLastIdx); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } - template <class T> -void Aidge::ArgMaxImpl_cuda::forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - +void Aidge::ArgMaxImpl_cuda::forward_(const Tensor &input, + std::int32_t axis, 
+ DimSize_t selectLastIdx) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> inputStrides(op.getInput(0)->nbDims(), 1); - if(op.getInput(0)->nbDims()>1) { - for (int i = op.getInput(0)->nbDims()-2; i >= 0; i--) { - inputStrides[i] = inputStrides[i+1] * op.getInput(0)->dims()[i+1]; + if (op.getInput(0)->nbDims() > 1) { + for (int i = op.getInput(0)->nbDims() - 2; i >= 0; i--) { + inputStrides[i] = + inputStrides[i + 1] * op.getInput(0)->dims()[i + 1]; } } std::vector<int> inputShape(input.nbDims()); // Use std::transform to convert each element - std::transform(input.dims().begin(), input.dims().end(), inputShape.begin(), - [](size_t value) { - return static_cast<int>(value); - }); - Aidge::ArgMax_cuda_forward_kernel<T>(inputPtr, outputPtr, - inputShape, inputStrides, - axis, static_cast<int>(op.getInput(0)->size()), selectLastIdx); + std::transform(input.dims().begin(), + input.dims().end(), + inputShape.begin(), + [](size_t value) { return static_cast<int>(value); }); + Aidge::ArgMax_cuda_forward_kernel<T>( + inputPtr, + outputPtr, + inputShape, + inputStrides, + axis, + static_cast<int>(op.getInput(0)->size()), + selectLastIdx); } diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp index d1270ee4b0a556e1053f3cfde8d71ec5efbee279..3ae52d114f005408722e5f2259013db91a6a0f1c 100644 --- a/src/operator/AvgPoolingImpl.cpp +++ b/src/operator/AvgPoolingImpl.cpp @@ -21,27 +21,30 @@ template <Aidge::DimIdx_t DIM> void Aidge::AvgPoolingImpl_cuda<DIM>::forward() { - const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp); + const AvgPooling_Op<DIM> &op = + dynamic_cast<const AvgPooling_Op<DIM> &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN AvgPooling descriptor if (mAvgPoolingDesc == nullptr) { - const std::vector<int> strides(op.strideDims().begin(), op.strideDims().end()); + const std::vector<int> strides(op.strideDims().begin(), + op.strideDims().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> window_dims(op.kernelDims().begin(), op.kernelDims().end()); + const std::vector<int> window_dims(op.kernelDims().begin(), + op.kernelDims().end()); CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc)); - CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mAvgPoolingDesc, - mMode, - CUDNN_NOT_PROPAGATE_NAN, - DIM, - &window_dims[0], - &paddings[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetPoolingNdDescriptor(mAvgPoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + DIM, + &window_dims[0], + &paddings[0], + &strides[0])); } // Do the actual forward computation @@ -49,77 +52,85 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward() { // excepted when the convolution is performed in double precision. 
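For readers unfamiliar with the convention the comment above refers to: cuDNN expects the host-side alpha/beta scaling factors to be passed as float for every tensor data type except double, which is why only two instantiations are dispatched here. A minimal sketch of a trait encoding that rule is shown below; it is illustrative only, and the actual Cuda::cudnn_scaling_type helper in this backend may be defined differently.

// Illustrative only: scaling factors are float unless the tensor data type is double.
template <class T> struct scaling_type_sketch { using type = float; };
template <> struct scaling_type_sketch<double> { using type = double; };
// e.g. both scaling_type_sketch<half>::type and scaling_type_sketch<float>::type are float.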
if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { - const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp); +void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor &input) { + const AvgPooling_Op<DIM> &op = + dynamic_cast<const AvgPooling_Op<DIM> &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mAvgPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - op.getOutput(0)->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mAvgPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + op.getOutput(0)->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> void Aidge::AvgPoolingImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - AIDGE_ASSERT(mAvgPoolingDesc != nullptr, "AvgPool descriptor must be created during forward!"); + AIDGE_ASSERT(mAvgPoolingDesc != nullptr, + "AvgPool descriptor must be created during forward!"); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Do the actual backward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
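As a point of reference for what cudnnPoolingBackward computes in the average-pooling case (leaving aside padding-count subtleties): every input element accumulates the corresponding output gradient divided by the window size, summed over all windows that cover it. A naive 1-D version of that semantics, purely for illustration and not part of the patch:

// dy: output gradient (outLen elements), dx: input gradient (inLen elements),
// window size K, stride S, no padding.
void avgpool1d_backward_ref(const float *dy, float *dx,
                            int inLen, int outLen, int K, int S) {
    for (int i = 0; i < inLen; ++i)
        dx[i] = 0.f;
    for (int o = 0; o < outLen; ++o)
        for (int k = 0; k < K; ++k)
            dx[o * S + k] += dy[o] / static_cast<float>(K); // each window spreads dy[o]/K
}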
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mAvgPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + mAvgPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() { - if(mAvgPoolingDesc != nullptr) + if (mAvgPoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mAvgPoolingDesc); } diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp index 5cf079326a0ea003fb72875bcaebefe847086ecb..dee6bcfed60ddd3e1b1aeab26495e722a1fb4231 100644 --- a/src/operator/BatchNormImpl.cpp +++ b/src/operator/BatchNormImpl.cpp @@ -20,8 +20,7 @@ #include "aidge/operator/BatchNorm.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::BatchNormImpl_cuda<DIM>::forward() { +template <Aidge::DimIdx_t DIM> void Aidge::BatchNormImpl_cuda<DIM>::forward() { // FIXME: uncomment the following code once memory handling will work AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); @@ -29,17 +28,37 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { AIDGE_ASSERT(mOp.getRawInput(3), "missing input #3"); AIDGE_ASSERT(mOp.getRawInput(4), "missing input #4"); + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, + input3Fallback, input4Fallback; + const auto &input0 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + input0Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input1 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) + ->refCastFrom( + input1Fallback, + 
*std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input2 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) + ->refCastFrom( + input2Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input3 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(3)) + ->refCastFrom( + input3Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input4 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(4)) + ->refCastFrom( + input4Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, input3Fallback, input4Fallback; - const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input3 = std::static_pointer_cast<Tensor>(mOp.getRawInput(3))->refCastFrom(input3Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input4 = std::static_pointer_cast<Tensor>(mOp.getRawInput(4))->refCastFrom(input4Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - - if (mBNDesc == nullptr) - { - const BatchNorm_Op<DIM>& bnOp = static_cast<const BatchNorm_Op<DIM>&>(mOp); + if (mBNDesc == nullptr) { + const BatchNorm_Op<DIM> &bnOp = + static_cast<const BatchNorm_Op<DIM> &>(mOp); mEpsilon = static_cast<double>(bnOp.epsilon()); mMode = CUDNN_BATCHNORM_SPATIAL; @@ -50,8 +69,10 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mBNDesc)); CHECK_CUDNN_STATUS(cudnnDeriveBNTensorDescriptor( - mBNDesc, std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), mMode)); - + mBNDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + mMode)); cudnnDataType_t dataType; const unsigned int nbDimsRequested = DIM; @@ -59,159 +80,218 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { std::vector<int> strides(nbDimsRequested); int nbDims; CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mBNDesc, - nbDimsRequested, - &dataType, - &nbDims, - &dims[0], - &strides[0])); + nbDimsRequested, + &dataType, + &nbDims, + &dims[0], + &strides[0])); dims.resize(nbDims); strides.resize(nbDims); } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2, input3, input4); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2, input3, input4); - break; - case DataType::Float16: - forward_<half>(input0, input1, input2, input3, input4); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, input3, input4); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, input3, input4); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, input3, input4); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by 
Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + const Tensor &input3, + const Tensor &input4) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnTensorDescriptor_t tensorDesc; - // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels - if (input1.nbDims() == 1) - { + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go + // on the channels + if (input1.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - const std::vector<int> dims = {1, static_cast<int>(input1.size()),1, 1}; - const std::vector<int> strides = {static_cast<int>(input1.size()), 1, 1, 1}; - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); - } - else { - tensorDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl())->getCudnnTensorDesc(input1); + const std::vector<int> dims = {1, + static_cast<int>(input1.size()), + 1, + 1}; + const std::vector<int> strides = {static_cast<int>(input1.size()), + 1, + 1, + 1}; + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + dims.size(), + dims.data(), + strides.data())); + } else { + tensorDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl()) + ->getCudnnTensorDesc(input1); } - CHECK_CUDNN_STATUS( - cudnnBatchNormalizationForwardInference( - CudaContext::cudnnHandle(), - mMode, - &alpha, - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - tensorDesc, - input1.getImpl()->rawPtr(), - input2.getImpl()->rawPtr(), - input3.getImpl()->rawPtr(), - input4.getImpl()->rawPtr(), - mEpsilon) - ); - if (input1.nbDims() == 1) - { + CHECK_CUDNN_STATUS(cudnnBatchNormalizationForwardInference( + CudaContext::cudnnHandle(), + mMode, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + tensorDesc, + input1.getImpl()->rawPtr(), + input2.getImpl()->rawPtr(), + input3.getImpl()->rawPtr(), + input4.getImpl()->rawPtr(), + mEpsilon)); + if (input1.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } } template <Aidge::DimIdx_t DIM> void Aidge::BatchNormImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(mBNDesc != nullptr, "BatchNorm descriptor must be created during forward!"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(mBNDesc != nullptr, + "BatchNorm descriptor must be 
created during forward!"); for (IOIndex_t i = 0; i < (op.nbInputs() - 2); ++i) { - AIDGE_ASSERT(op.getInput(i), "missing input # {} in BatchNorm operator", i); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run BatchNorm backward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i), + "missing input # {} in BatchNorm operator", + i); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run BatchNorm backward because the {}-th input " + "has no implementation.", + i); } - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in BatchNorm operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run BatchNorm backward because the output grad has no implementation."); - - std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, outputGradFallback; - const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& bias = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); - - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(input0, outputGrad, weights); - break; - case DataType::Float32: - backward_<float>(input0, outputGrad, weights); - break; - case DataType::Float16: - backward_<half>(input0, outputGrad, weights); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in BatchNorm operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run BatchNorm backward because the output grad has " + "no implementation."); + + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, + outputGradFallback; + const auto &input0 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + input0Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &weights = + std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) + ->refCastFrom( + input1Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &bias = + std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) + ->refCastFrom( + input2Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, outputGrad, weights); + break; + case DataType::Float32: + backward_<float>(input0, outputGrad, weights); + break; + case DataType::Float16: + backward_<half>(input0, outputGrad, weights); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& outputGrad, const Tensor& weights) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor 
&input0, + const Tensor &outputGrad, + const Tensor &weights) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type alphaData = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; cudnnTensorDescriptor_t scaleBiasDesc; - // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels - if (weights.nbDims() == 1) - { + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go + // on the channels + if (weights.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&scaleBiasDesc)); - const std::vector<int> dims = {1, static_cast<int>(weights.size()),1, 1}; - const std::vector<int> strides = {static_cast<int>(weights.size()), 1, 1, 1}; - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(scaleBiasDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); - } - else { - scaleBiasDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl())->getCudnnTensorDesc(weights); + const std::vector<int> dims = {1, + static_cast<int>(weights.size()), + 1, + 1}; + const std::vector<int> strides = {static_cast<int>(weights.size()), + 1, + 1, + 1}; + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(scaleBiasDesc, + CudaContext::data_type<T>::value, + dims.size(), + dims.data(), + strides.data())); + } else { + scaleBiasDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl()) + ->getCudnnTensorDesc(weights); } - CHECK_CUDNN_STATUS( - cudnnBatchNormalizationBackward( - CudaContext::cudnnHandle(), - mMode, - &alphaData, - &betaData, - &alpha, - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(outputGrad), - outputGrad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr(), - scaleBiasDesc, - weights.getImpl()->rawPtr(), - op.getInput(1)->grad()->getImpl()->rawPtr(), - op.getInput(2)->grad()->getImpl()->rawPtr(), - mEpsilon, - nullptr, - nullptr) // TODO add savedMean and savedVar? + CHECK_CUDNN_STATUS(cudnnBatchNormalizationBackward( + CudaContext::cudnnHandle(), + mMode, + &alphaData, + &betaData, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl()) + ->getCudnnTensorDesc(outputGrad), + outputGrad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr(), + scaleBiasDesc, + weights.getImpl()->rawPtr(), + op.getInput(1)->grad()->getImpl()->rawPtr(), + op.getInput(2)->grad()->getImpl()->rawPtr(), + mEpsilon, + nullptr, + nullptr) // TODO add savedMean and savedVar? 
); - if (weights.nbDims() == 1) - { + if (weights.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(scaleBiasDesc)); } } template <Aidge::DimIdx_t DIM> Aidge::BatchNormImpl_cuda<DIM>::~BatchNormImpl_cuda() { - if(mBNDesc != nullptr) - { + if (mBNDesc != nullptr) { cudnnDestroyTensorDescriptor(mBNDesc); } } diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 24e01db03692ffaa884b31a224a1947a9e1645a0..ca9f3aa32cf748b410d67c1d76ade216a0e5a829 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::ConvImpl_cuda<DIM>::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::forward() { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "the 0-th input has no implementation."); AIDGE_ASSERT(op.getInput(1), "missing input #1"); - AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), + "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) - const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); Tensor input2; - if(op.getInput(2) && op.getInput(2)->hasImpl()) { - input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + if (op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); } // Lazy-initialize CuDNN convolution descriptor @@ -42,18 +46,24 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { const std::vector<int> paddings(DIM, 0); std::vector<int> strides, upscales; if (mDepthWise) { - const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp); - strides = std::vector<int>(convDWOp.strideDims().begin(), convDWOp.strideDims().end()); - upscales = std::vector<int>(convDWOp.dilationDims().begin(), convDWOp.dilationDims().end()); - } - else { - const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); - strides = std::vector<int>(convOp.strideDims().begin(), convOp.strideDims().end()); - upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end()); + const ConvDepthWise_Op<DIM> &convDWOp = + static_cast<const ConvDepthWise_Op<DIM> &>(mOp); + strides = std::vector<int>(convDWOp.strideDims().begin(), + convDWOp.strideDims().end()); + upscales = std::vector<int>(convDWOp.dilationDims().begin(), + convDWOp.dilationDims().end()); + } else { + const Conv_Op<DIM> &convOp = + static_cast<const Conv_Op<DIM> &>(mOp); + strides = std::vector<int>(convOp.strideDims().begin(), + convOp.strideDims().end()); + upscales = std::vector<int>(convOp.dilationDims().begin(), + convOp.dilationDims().end()); } CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor( + mConvDesc, DIM, 
&paddings[0], &strides[0], @@ -64,27 +74,33 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // Lazy-initialize CuDNN filter descriptor if (mFilterDesc == nullptr) { - const std::vector<int> kernels(input1.dims().begin(), input1.dims().end()); + const std::vector<int> kernels(input1.dims().begin(), + input1.dims().end()); CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc)); - CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc, - DataTypeToCudnn(input1.dataType()), - CUDNN_TENSOR_NCHW, - kernels.size(), - &kernels[0])); + CHECK_CUDNN_STATUS( + cudnnSetFilterNdDescriptor(mFilterDesc, + DataTypeToCudnn(input1.dataType()), + CUDNN_TENSOR_NCHW, + kernels.size(), + &kernels[0])); } // Set forward algorithm and allocate the required workspace if (mFwdWorkspace == nullptr) { - // Allocate the workspace required by the chosen CuDNN forward algorithm + // Allocate the workspace required by the chosen CuDNN forward + // algorithm size_t workspaceSize = 0; CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize( CudaContext::cudnnHandle(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), mFilterDesc, mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mFwdAlgo, &workspaceSize)); @@ -97,21 +113,24 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input0, input1, input2); - } - else { + } else { forward_<float>(input0, input1, input2); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS(cudnnConvolutionForward(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnConvolutionForward( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), input0.getImpl()->rawPtr(), mFilterDesc, input1.getImpl()->rawPtr(), @@ -120,63 +139,81 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp mFwdWorkspace, mWorkspaceSize, &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), op.getOutput(0)->getImpl()->rawPtr())); // Add bias (if there is any) if (mOp.getRawInput(2) && input2.size() > 0) { - // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor() - std::vector<DimSize_t> biasDims(DIM+2, 1); + // Bias tensor needs to have the same number of dims than output tensor + // for cudnnAddTensor() + std::vector<DimSize_t> biasDims(DIM 
+ 2, 1); biasDims[1] = input2.size(); - // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc()) + // Create a dummy tensor with the right dims in order to get a CuDNN + // tensor descriptor (with getCudnnTensorDesc()) Tensor bias(input2.dataType()); bias.setBackend("cuda"); bias.resize(biasDims); // TODO: find a more elegant solution(?) - CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl())->getCudnnTensorDesc(bias), + std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl()) + ->getCudnnTensorDesc(bias), input2.getImpl()->rawPtr(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), op.getOutput(0)->getImpl()->rawPtr())); } } -template <Aidge::DimIdx_t DIM> -void Aidge::ConvImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::backward() { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "the 0-th input has no implementation."); AIDGE_ASSERT(op.getInput(1), "missing input #1"); - AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), + "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) 
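Regarding the bias path handled in forward_() above: viewing the C-element bias as an NCHW tensor of shape {1, C, 1, 1} lets cudnnAddTensor broadcast it over the batch and spatial dimensions, and since both scaling arguments passed to the call are alpha = 1 it reduces to an in-place accumulation. A reference version of that broadcast add, for illustration only and not part of the patch:

// y has shape {N, C, H, W} in NCHW layout; bias has C elements.
void add_bias_nchw_ref(float *y, const float *bias, int N, int C, int H, int W) {
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int i = 0; i < H * W; ++i)
                y[(n * C + c) * H * W + i] += bias[c]; // broadcast over N, H and W
}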
- const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); Tensor input2; - if(op.getInput(2) && op.getInput(2)->hasImpl()) { - input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + if (op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); } // Set forward algorithm and allocate the required workspace if (mBwdWorkspace == nullptr) { - // Find the best CuDNN backward algorithm (the one with the lowest compute time) + // Find the best CuDNN backward algorithm (the one with the lowest + // compute time) int maxAlgoIterations = 0; - cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(), - &maxAlgoIterations); - assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm"); + cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && + "No available CUDNN ConvolutionBackwardFilterAlgorithm"); int returnAlgoCounts = 0; - std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo( + maxAlgoIterations); CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm( CudaContext::cudnnHandle(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, mFilterDesc, maxAlgoIterations, @@ -186,33 +223,43 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { mBwdFilterAlgo = returnBwdFilterAlgo[0].algo; maxAlgoIterations = 0; - cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(), - &maxAlgoIterations); - assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm"); + cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && + "No available CUDNN ConvolutionBackwardDataAlgorithm"); returnAlgoCounts = 0; - std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo( + maxAlgoIterations); CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm( CudaContext::cudnnHandle(), mFilterDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), maxAlgoIterations, &returnAlgoCounts, &returnBwdDataAlgo[0])); mBwdDataAlgo = returnBwdDataAlgo[0].algo; - // Allocate the workspace required by the chosen CuDNN backward algorithm + // Allocate the workspace 
required by the chosen CuDNN backward + // algorithm size_t workspaceSize = 0; CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize( CudaContext::cudnnHandle(), // same arguments as cudnnGetConvolutionBackwardFilterAlgorithm() // --> - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, mFilterDesc, // <-- @@ -224,9 +271,12 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { CudaContext::cudnnHandle(), // same arguments as cudnnGetConvolutionBackwardDataAlgorithm() --> mFilterDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), // <-- mBwdDataAlgo, &workspaceSizeData)); @@ -250,19 +300,22 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(input0, input1, input2); - } - else { + } else { backward_<float>(input0, input1, input2); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); std::shared_ptr<Tensor> gradOutputFallback; - const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad())); + const auto &gradOutput = + op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, + *(op.getInput(0)->grad())); const T alpha = 1.0f; const T beta = 0.0f; @@ -270,9 +323,11 @@ void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& in CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter( CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), mConvDesc, mBwdFilterAlgo, @@ -287,40 +342,47 @@ void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& in &alpha, mFilterDesc, input1.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), mConvDesc, mBwdDataAlgo, mBwdWorkspace, mWorkspaceSize, &beta, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), op.getInput(0)->grad()->getImpl()->rawPtr())); // Add bias (if there is any) if (mOp.getRawInput(2) && input2.size() > 0) { - // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor() - std::vector<DimSize_t> gradBiasDims(DIM+2, 1); + // Bias tensor needs to have the same number of dims than output tensor + // for cudnnAddTensor() + std::vector<DimSize_t> gradBiasDims(DIM + 2, 1); gradBiasDims[1] = op.getInput(2)->grad()->size(); - // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc()) + // Create a dummy tensor with the right dims in order to get a CuDNN + // tensor descriptor (with getCudnnTensorDesc()) Tensor gradBias(op.getInput(2)->grad()->dataType()); gradBias.setBackend("cuda"); gradBias.resize(gradBiasDims); // TODO: find a more elegant solution(?) - CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl()) + ->getCudnnTensorDesc(gradBias), op.getInput(2)->grad()->getImpl()->rawPtr())); } } -template <Aidge::DimIdx_t DIM> -Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { +template <Aidge::DimIdx_t DIM> Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { if (mConvDesc != nullptr) { cudnnDestroyConvolutionDescriptor(mConvDesc); } @@ -334,7 +396,6 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { } } - // Template declarations template class Aidge::ConvImpl_cuda<1>; template class Aidge::ConvImpl_cuda<2>; diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp index 0326a60c1a3aabf43ca3a1d892328991d6d72366..4ae5c2b28cdd61956bc3c3b0bb7886ce99dcdf85 100644 --- a/src/operator/DivImpl.cpp +++ b/src/operator/DivImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::DivImpl_cuda::forward() { - const Div_Op& op = static_cast<const Div_Op&>(mOp); + const Div_Op &op = static_cast<const Div_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Div operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Div forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Div forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Div operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Div forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Div inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Div forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Div inputs with two differents data 
type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,54 +71,67 @@ void Aidge::DivImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::DivImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::DivImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 
1); + std::vector<int> outDims( + std::max(op.getOutput(0)->nbDims(), std::size_t(4)), + 1); for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); } - Aidge::divForward<T>(input1Ptr, outputPtr, input2Ptr, - inputsDims[0], inputsDims[1], outDims, - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::divForward<T>(input1Ptr, + outputPtr, + input2Ptr, + inputsDims[0], + inputsDims[1], + outDims, + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::DivImpl_cuda::backward() { // TODO } -template <class T> -void Aidge::DivImpl_cuda::backward_(const Tensor& outGrad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::DivImpl_cuda::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // TODO diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 1a7bb8edb51312d08467354e20723ad19176bfee..0478c14cd4086ce4d43343735bbd01ed6aef58f1 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -28,91 +28,110 @@ void Aidge::FCImpl_cuda::forward() { AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); - const auto& fcOp = static_cast<const FC_Op&>(mOp); + const auto &fcOp = static_cast<const FC_Op &>(mOp); std::size_t outChannels = fcOp.outChannels(); - const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); - const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2, outChannels); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2, outChannels); - break; - case DataType::Float16: - forward_<half>(input0, input1, input2, outChannels); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto &input1 = + fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto &input2 = + (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, + *fcOp.getOutput(0)) + : Tensor(); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, outChannels); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, outChannels); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) -{ - const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::FCImpl_cuda::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels) { + const T *input = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weights = static_cast<const T *>(input1.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); // Performing output = T(weights) * input // [n x m] = [n x k] * [k x m] - // cublas is column-major so instead of transposing inputs, computing output [m x n] and transposing output, we compute output as [n x m] + // cublas is column-major so instead of transposing inputs, computing + // output [m x n] and transposing output, we compute output as [n x m] int n = outChannels; - int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n; - int k = input0.size()/m; - int lda = k; // leading dimension of weights - int ldb = k; // leading dimension of input - int ldc = n; // leading dimension of output + int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->size() / + n; + int k = input0.size() / m; + int lda = k; // leading dimension of weights + int ldb = k; // leading dimension of input + int ldc = n; // leading dimension of output const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - weights, - ldb, - input, - lda, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), - output, - ldc)); - - if(!input2.empty()){ - T* onesVector; - CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + m, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), + weights, + ldb, + input, + lda, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), + output, + ldc)); + + if (!input2.empty()) { + T *onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void **)&onesVector, m * sizeof(T))); // Fill the vector with ones std::vector<T> onesVec(m, T(1.0)); CHECK_CUDA_STATUS(cudaMemcpy(onesVector, - &onesVec[0], - m * sizeof(T), - cudaMemcpyHostToDevice)); - const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr()); + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); + const T *biases = static_cast<const T 
*>(input2.getImpl()->rawPtr()); // Performing output = biases * onesVector + output // [n x m] = [n x 1] * [1 x m] + [n x m] - CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - biases, - n, - onesVector, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - output, - n)); + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + m, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + biases, + n, + onesVector, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + output, + n)); CHECK_CUDA_STATUS(cudaFree(onesVector)); } - } void Aidge::FCImpl_cuda::backward() { @@ -120,45 +139,56 @@ void Aidge::FCImpl_cuda::backward() { AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); - const auto& fcOp = static_cast<const FC_Op&>(mOp); + const auto &fcOp = static_cast<const FC_Op &>(mOp); std::size_t outChannels = fcOp.outChannels(); - const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); - const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(input0, input1, input2, outChannels); - break; - case DataType::Float32: - backward_<float>(input0, input1, input2, outChannels); - break; - case DataType::Float16: - backward_<half>(input0, input1, input2, outChannels); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto &input1 = + fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto &input2 = + (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, + *fcOp.getOutput(0)) + : Tensor(); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, input1, input2, outChannels); + break; + case DataType::Float32: + backward_<float>(input0, input1, input2, outChannels); + break; + case DataType::Float16: + backward_<half>(input0, input1, input2, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) -{ +template <class T> +void Aidge::FCImpl_cuda::backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels) { const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * weightsGrad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weights = static_cast<const T *>(input1.getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *weightsGrad = + static_cast<T *>(op.getInput(1)->grad()->getImpl()->rawPtr()); // Performing weightsGrad = (input) * T(outputGrad) // [n x m] = [n x k] * [k x m] int m = input1.dims()[1]; - int k = input0.size()/m; - int n = input1.size()/m; + int k = input0.size() / m; + int n = input1.size() / m; CHECK_CUBLAS_STATUS(cublasGemm( CudaContext::cublasHandle(), CUBLAS_OP_N, @@ -166,38 +196,41 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c m, n, k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), input, m, outputGrad, n, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), weightsGrad, m)); - if(!input2.empty()){ - T * biasGrad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); - T* onesVector; - CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + if (!input2.empty()) { + T *biasGrad = + static_cast<T *>(op.getInput(2)->grad()->getImpl()->rawPtr()); + T *onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void **)&onesVector, m * sizeof(T))); // Fill the vector with ones std::vector<T> onesVec(m, T(1.0)); CHECK_CUDA_STATUS(cudaMemcpy(onesVector, - &onesVec[0], - m * sizeof(T), - cudaMemcpyHostToDevice)); + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); // Performing biasGrad = outputGrad * onesVector - CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(), - CUBLAS_OP_N, - outChannels, - k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - outputGrad, - outChannels, - onesVector, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), - biasGrad, - 1)); + CHECK_CUBLAS_STATUS(cublasGemv( + 
CudaContext::cublasHandle(), + CUBLAS_OP_N, + outChannels, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + outputGrad, + outChannels, + onesVector, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), + biasGrad, + 1)); CHECK_CUDA_STATUS(cudaFree(onesVector)); } // Performing inputGrad = (weights) * (outputGrad) @@ -205,16 +238,15 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c CudaContext::cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, - op.getInput(1)->grad()->size()/outChannels, + op.getInput(1)->grad()->size() / outChannels, k, outChannels, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - weights,//w - op.getInput(1)->grad()->size()/outChannels, - outputGrad,//dY + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), + weights, // w + op.getInput(1)->grad()->size() / outChannels, + outputGrad, // dY outChannels, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&betaData), - static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()),//dX - op.getInput(1)->grad()->size()/outChannels)); - + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&betaData), + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()), // dX + op.getInput(1)->grad()->size() / outChannels)); } diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp index 8c83d477094d9cce41807d888cca57bd614e9cc6..d8392f9c8e52d8d226fec8713148e35468460975 100644 --- a/src/operator/GlobalAveragePoolingImpl.cpp +++ b/src/operator/GlobalAveragePoolingImpl.cpp @@ -20,92 +20,113 @@ #include "aidge/utils/Types.h" void Aidge::GlobalAveragePoolingImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN GlobalAveragePooling descriptor if (mGlobalAveragePoolingDesc == nullptr) { - int poolingDims = 2; // Assuming 2D pooling - int windowDims[2] = {static_cast<int>(input.dims().at(2)), static_cast<int>(input.dims().at(3))}; // Pooling window dimensions matching spatial dimensions of input tensor - int padding[2] = {0, 0}; // No padding - int stride[2] = {1, 1}; // Stride of 1 - CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc)); + int poolingDims = 2; // Assuming 2D pooling + int windowDims[2] = {static_cast<int>(input.dims().at(2)), + static_cast<int>(input.dims().at( + 3))}; // Pooling window dimensions matching + // spatial dimensions of input tensor + int padding[2] = {0, 0}; // No padding + int stride[2] = {1, 1}; // Stride of 1 CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, poolingDims, windowDims, padding, stride) - // cudnnSetPooling2dDesccomputedOutputriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, 1, 1, 0, 0, 1, 1) + cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc)); + CHECK_CUDNN_STATUS( + cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + poolingDims, + windowDims, + padding, + stride) + // cudnnSetPooling2dDesccomputedOutputriptor(mGlobalAveragePoolingDesc, + // mMode, CUDNN_NOT_PROPAGATE_NAN, 1, 1, 0, 0, 1, 1) ); } if 
(op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <class T> -void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mGlobalAveragePoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::GlobalAveragePoolingImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, "GlobalAvgPool descriptor must be created during forward!"); + AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, + "GlobalAvgPool descriptor must be created during forward!"); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::GlobalAveragePoolingImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::GlobalAveragePoolingImpl_cuda::backward_( + const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mGlobalAveragePoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + 
mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::GlobalAveragePoolingImpl_cuda::~GlobalAveragePoolingImpl_cuda() { - if(mGlobalAveragePoolingDesc != nullptr) + if (mGlobalAveragePoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mGlobalAveragePoolingDesc); } - diff --git a/src/operator/ILayerNormImpl.cpp b/src/operator/ILayerNormImpl.cpp index 47dd1d5d1a3f127c9e08788f605796020a7814a7..41b9b48c57894316d71ae361cae8159d4d4a9ca8 100644 --- a/src/operator/ILayerNormImpl.cpp +++ b/src/operator/ILayerNormImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" @@ -30,52 +30,60 @@ void Aidge::ILayerNormImpl_cuda::forward() { - - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); assert(mOp.getRawInput(1) && "missing input #1"); assert(mOp.getRawInput(2) && "missing input #2"); - const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); - const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } - -template<class T> -void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const 
Tensor& input1, const Tensor& input2) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weight = static_cast<const T*>(input1.getImpl()->rawPtr()); - const T * bias = static_cast<const T*>(input2.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ILayerNormImpl_cuda::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weight = static_cast<const T *>(input1.getImpl()->rawPtr()); + const T *bias = static_cast<const T *>(input2.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input0.size(); std::vector<DimSize_t> dims_input = input0.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input0.getIdx(coordIdx); if (newFlatIdx < min) { @@ -84,57 +92,76 @@ void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const Tensor& in if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ILayerNormforward(input_raw, output, scaling_factor, weight, bias, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ILayerNormforward(input_raw, + output, + scaling_factor, + weight, + bias, + size, + dims_input); } void Aidge::ILayerNormImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else 
{ backward_<float>(output_grad); } } template <class T> -void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ILayerNormImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); size_t size = output_grad.size(); std::vector<DimSize_t> dims_input = output_grad.dims(); - const T * output = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); - - T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); - T * weight_grad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); - T * bias_grad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); - - const T * input = static_cast<const T*>(op.getInput(0)->getImpl()->rawPtr()); - const T * weight = static_cast<const T*>(op.getInput(1)->getImpl()->rawPtr()); - const T * bias = static_cast<const T*>(op.getInput(2)->getImpl()->rawPtr()); + const T *output = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); + + T *input_grad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *weight_grad = + static_cast<T *>(op.getInput(1)->grad()->getImpl()->rawPtr()); + T *bias_grad = + static_cast<T *>(op.getInput(2)->grad()->getImpl()->rawPtr()); + + const T *input = + static_cast<const T *>(op.getInput(0)->getImpl()->rawPtr()); + const T *weight = + static_cast<const T *>(op.getInput(1)->getImpl()->rawPtr()); + const T *bias = + static_cast<const T *>(op.getInput(2)->getImpl()->rawPtr()); // maybe find a most efficient way to compute mean and variance tensor - std::vector<std::vector<std::vector<std::vector<T>>>> means(dims_input[0], - std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<std::vector<std::vector<T>>>> means( + dims_input[0], + std::vector<std::vector<std::vector<T>>>( + dims_input[1], std::vector<std::vector<T>>(dims_input[2], - std::vector<T>(dims_input[3], 0.0f)))); + std::vector<T>(dims_input[3], 0.0f)))); for (std::size_t i = 0; i < dims_input[0]; i++) { for (std::size_t j = 0; j < dims_input[1]; j++) { @@ -157,16 +184,20 @@ void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { for (const auto &vec3d : means) { for (const auto &vec2d : vec3d) { for (const auto &vec1d : vec2d) { - flat_means.insert(flat_means.end(), vec1d.begin(), vec1d.end()); + flat_means.insert(flat_means.end(), + vec1d.begin(), + vec1d.end()); } } } - std::vector<std::vector<std::vector<std::vector<T>>>> vars(dims_input[0], - std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<std::vector<std::vector<T>>>> vars( + dims_input[0], + std::vector<std::vector<std::vector<T>>>( + dims_input[1], std::vector<std::vector<T>>(dims_input[2], - std::vector<T>(dims_input[3], 0.0f)))); - + std::vector<T>(dims_input[3], 0.0f)))); + for (std::size_t i = 0; i < dims_input[0]; i++) { for (std::size_t j = 0; j < dims_input[1]; j++) { for (std::size_t k = 0; k < dims_input[2]; k++) { @@ -196,9 +227,20 @@ void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { } } - const T* mean_ = flat_means.data(); - const T* var_ = flat_vars.data(); - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); - - ILayerNormbackward(output, output_grad_raw, input, mean_, var_, weight, bias, input_grad, weight_grad, bias_grad, size); + const T *mean_ = flat_means.data(); 
+ const T *var_ = flat_vars.data(); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); + + ILayerNormbackward(output, + output_grad_raw, + input, + mean_, + var_, + weight, + bias, + input_grad, + weight_grad, + bias_grad, + size); } diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp index ed09ed45f5006c3760376a9d6f44f29d05bcfabe..1b88a601b29775abaca9ab1f7008142ab06d8b8c 100644 --- a/src/operator/LnImpl.cpp +++ b/src/operator/LnImpl.cpp @@ -21,60 +21,66 @@ #include "aidge/utils/Types.h" void Aidge::LnImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - case DataType::Float16: - forward_<half>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template <class T> -void Aidge::LnImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); - +template <class T> void Aidge::LnImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); - Aidge::lnForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); + Aidge::lnForward<T>(inputPtr, + outputPtr, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::LnImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); - switch(op.getInput(0)->grad()->dataType()) { - case DataType::Float64: - backward_<double>(output_grad); - break; - case DataType::Float32: - backward_<float>(output_grad); - break; - case DataType::Float16: - backward_<half>(output_grad); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch (op.getInput(0)->grad()->dataType()) { + case DataType::Float64: + backward_<double>(output_grad); + break; + case DataType::Float32: + backward_<float>(output_grad); + break; + case DataType::Float16: + backward_<half>(output_grad); + break; + default: + 
AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::LnImpl_cuda::backward_(const Tensor& output_grad) { - //TODO +void Aidge::LnImpl_cuda::backward_(const Tensor &output_grad) { + // TODO } diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index 39050635102ebebaed8192cb4bb338e2bc31d5e8..7a6518af1cd603609e65fa9d0bba3003e04b42df 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -21,109 +21,120 @@ template <Aidge::DimIdx_t DIM> void Aidge::MaxPoolingImpl_cuda<DIM>::forward() { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0)); + const auto &input = + op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0)); // Lazy-initialize CuDNN MaxPooling descriptor if (mMaxPoolingDesc == nullptr) { - const std::vector<int> strides(op_.strideDims().begin(), op_.strideDims().end()); + const std::vector<int> strides(op_.strideDims().begin(), + op_.strideDims().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> window_dims(op_.kernelDims().begin(), op_.kernelDims().end()); + const std::vector<int> window_dims(op_.kernelDims().begin(), + op_.kernelDims().end()); CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc)); - CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mMaxPoolingDesc, - mMode, - CUDNN_NOT_PROPAGATE_NAN, - DIM, - &window_dims[0], - &paddings[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetPoolingNdDescriptor(mMaxPoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + DIM, + &window_dims[0], + &paddings[0], + &strides[0])); } - // Do the actual forward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
if (op_.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); +void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor &input) { + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mMaxPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), - op_.getOutput(0)->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mMaxPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> void Aidge::MaxPoolingImpl_cuda<DIM>::backward() { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); - AIDGE_ASSERT(mMaxPoolingDesc != nullptr, "MaxPool descriptor must be created during forward!"); + AIDGE_ASSERT(mMaxPoolingDesc != nullptr, + "MaxPool descriptor must be created during forward!"); AIDGE_ASSERT(op_.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op_.getOutput(0)->grad()); + const auto &output_grad = + op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op_.getOutput(0)->grad()); // Do the actual backward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
if (op_.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); +void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor &output_grad) { + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mMaxPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), - op_.getOutput(0)->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), - op_.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), - op_.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + mMaxPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->grad()->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() { - if(mMaxPoolingDesc != nullptr) + if (mMaxPoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mMaxPoolingDesc); } - // Template declarations template class Aidge::MaxPoolingImpl_cuda<2>; diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index af87251e8f29eded7d24cca2f08b880557ebb482..ed66e27fdcc2dc3e332349033e6866e4557f316b 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -11,9 +11,9 @@ #include <algorithm> #include <cassert> +#include <chrono> #include <numeric> #include <vector> -#include <chrono> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/MulImpl.hpp" @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::MulImpl_cuda::forward() { - const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + const Mul_Op &op = static_cast<const Mul_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Mul operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Mul forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Mul forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Mul operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Mul forward because the {}-th input has no 
implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Mul inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Mul forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Mul inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,62 +71,90 @@ void Aidge::MulImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::MulImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc0, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); 
CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc1, + CudaContext::data_type<T>::value, + inputsDims[1].size(), + inputsDims[1].data(), + inputsStrides[1].data())); // Multiply inputs cudnnOpTensorDescriptor_t opTensorDesc; CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); - if(inputs[0].size()>inputs[1].size()) { - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), - &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - } - else { - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), - &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS( + cudnnSetOpTensorDescriptor(opTensorDesc, + CUDNN_OP_TENSOR_MUL, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN)); + if (inputs[0].size() > inputs[1].size()) { + CHECK_CUDNN_STATUS( + cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); + } else { + CHECK_CUDNN_STATUS( + cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); @@ -123,24 +163,35 @@ void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std: } void Aidge::MulImpl_cuda::backward() { - const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + const Mul_Op &op = static_cast<const Mul_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Mul operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Mul backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Mul operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Mul backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + 
op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); + if (dims[i].size() < 4) { dims[i].resize(4, 1); } @@ -155,66 +206,88 @@ void Aidge::MulImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::MulImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc0, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); - + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc1, + CudaContext::data_type<T>::value, + inputsDims[1].size(), + inputsDims[1].data(), + inputsStrides[1].data())); 
+ // Create the operation descriptor cudnnOpTensorDescriptor_t opTensorDesc; CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); + CHECK_CUDNN_STATUS( + cudnnSetOpTensorDescriptor(opTensorDesc, + CUDNN_OP_TENSOR_MUL, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN)); // Input0_grad = output_grad * Input1 - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &alpha, - tensorDesc1, - op.getInput(1)->getImpl()->rawPtr(), - &beta, - tensorDesc0, - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnOpTensor( + CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc1, + op.getInput(1)->getImpl()->rawPtr(), + &beta, + tensorDesc0, + op.getInput(0)->grad()->getImpl()->rawPtr())); // Input1_grad = output_grad * Input0 - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &alpha, - tensorDesc0, - op.getInput(0)->getImpl()->rawPtr(), - &beta, - tensorDesc1, - op.getInput(1)->grad()->getImpl()->rawPtr())); - + CHECK_CUDNN_STATUS(cudnnOpTensor( + CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc0, + op.getInput(0)->getImpl()->rawPtr(), + &beta, + tensorDesc1, + op.getInput(1)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp index 3606ba66d002f1467aa65771015cab02c066d5a5..6eed8a662d8bfe7bd1b9b0dc2abdfb54023f1c9f 100644 --- a/src/operator/PadImpl.cpp +++ b/src/operator/PadImpl.cpp @@ -21,15 +21,15 @@ #include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::PadImpl_cuda<DIM>::forward() -{ +template <Aidge::DimIdx_t DIM> void Aidge::PadImpl_cuda<DIM>::forward() { const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in Pad operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pad forward input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Pad forward input has no implementation."); - const auto &input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); auto paddingBorders = op.beginEndBorders(); @@ -38,8 +38,7 @@ void Aidge::PadImpl_cuda<DIM>::forward() mPadVal = op.borderValue(); mPadType = static_cast<unsigned int>(op.borderType()); - switch (op.getOutput(0)->dataType()) - { + switch (op.getOutput(0)->dataType()) { case DataType::Float64: forward_<double>(input); break; @@ -50,17 +49,21 @@ void 
Aidge::PadImpl_cuda<DIM>::forward() forward_<half>(input); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) -{ - const auto outDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); +void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) { + const auto outDims = + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); - T *output = static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); Aidge::cudaPadding(CudaContext::getDeviceProp(), outDims[1], outDims[3], @@ -77,15 +80,18 @@ void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) output); } -template <Aidge::DimIdx_t DIM> -void Aidge::PadImpl_cuda<DIM>::backward() -{ +template <Aidge::DimIdx_t DIM> void Aidge::PadImpl_cuda<DIM>::backward() { const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Pad operator"); - AIDGE_ASSERT(op.getOutput(0)->grad(), "cannot run Pad backward, output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Pad operator"); + AIDGE_ASSERT( + op.getOutput(0)->grad(), + "cannot run Pad backward, output gradient has no implementation."); - const auto &outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)); + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)); auto paddingBorders = op.beginEndBorders(); @@ -94,8 +100,8 @@ void Aidge::PadImpl_cuda<DIM>::backward() mPadVal = op.borderValue(); mPadType = static_cast<unsigned int>(op.borderType()); - switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) - { + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: backward_<double>(outGrad); break; @@ -106,17 +112,18 @@ void Aidge::PadImpl_cuda<DIM>::backward() backward_<half>(outGrad); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const auto inputGradDims = op.getInput(0)->grad()->dims(); - T *inputGrad = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); Aidge::cudaPadding(CudaContext::getDeviceProp(), inputGradDims[1], inputGradDims[3], diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp index 84af8c2a74c8ebaeb7d7380975089086e4db31da..60a47d2f38de9e257933f220d99bed306772d17f 100644 --- a/src/operator/PowImpl.cpp +++ b/src/operator/PowImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::PowImpl_cuda::forward() { - const Pow_Op& op = 
static_cast<const Pow_Op&>(mOp); + const Pow_Op &op = static_cast<const Pow_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Pow forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Pow operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Pow forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Pow inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Pow forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Pow inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,54 +71,67 @@ void Aidge::PowImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::PowImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::PowImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename 
Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1); + std::vector<int> outDims( + std::max(op.getOutput(0)->nbDims(), std::size_t(4)), + 1); for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); } - Aidge::powForward<T>(input1Ptr, outputPtr, input2Ptr, - inputsDims[0], inputsDims[1], outDims, - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::powForward<T>(input1Ptr, + outputPtr, + input2Ptr, + inputsDims[0], + inputsDims[1], + outDims, + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::PowImpl_cuda::backward() { // TODO } -template <class T> -void Aidge::PowImpl_cuda::backward_(const Tensor& outGrad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::PowImpl_cuda::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // TODO diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp index 80d52045e832b42a95b6d7448f2016530bb9d1ac..688b035a8ff99defb7d47c240f1f9ebc0fe7ac78 100644 --- a/src/operator/ReLUImpl.cpp +++ b/src/operator/ReLUImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::ReLUImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN ReLU descriptor if (mReLUDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mReLUDesc = CUDNN_ACTIVATION_RELU; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mReLUDesc, + CUDNN_ACTIVATION_RELU, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mReLUDesc = CUDNN_ACTIVATION_RELU; +#endif } // Do the actual forward computation @@ -42,44 +46,51 @@ void Aidge::ReLUImpl_cuda::forward() { 
// excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } -template <class T> -void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::ReLUImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mReLUDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mReLUDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::ReLUImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize CuDNN ReLU descriptor if (mReLUDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mReLUDesc = CUDNN_ACTIVATION_RELU; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mReLUDesc, + CUDNN_ACTIVATION_RELU, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mReLUDesc = CUDNN_ACTIVATION_RELU; +#endif } // Do the actual backward computation @@ -87,37 +98,44 @@ void Aidge::ReLUImpl_cuda::backward() { // excepted when the convolution is performed in double precision. 
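    // Note on the cudnnActivationBackward call issued by backward_<T> below:
    // its tensor arguments are, in order, y (the forward output), dy (the
    // gradient with respect to the output), x (the forward input) and dx (the
    // gradient written to input #0's grad tensor), each given as a cuDNN
    // tensor descriptor followed by a raw device pointer, with alpha/beta
    // blending the result into dx.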
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReLUImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mReLUDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mReLUDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() { if (mReLUDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mReLUDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mReLUDesc); +#endif } } - diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp index ff83ea5153a95e109ce7ef83c42ed4d672561ad1..9f3cbc570e765e18120332a0eba25e0366a2aae7 100644 --- a/src/operator/ReduceMeanImpl.cpp +++ b/src/operator/ReduceMeanImpl.cpp @@ -15,93 +15,114 @@ #include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/ReduceMean.hpp" #include "aidge/utils/Types.h" void Aidge::ReduceMeanImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in ReduceMean operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceMean forward because the input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), 
+ "cannot run ReduceMean forward because the input has no " + "implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); + const ReduceMean_Op &rmOp = static_cast<const ReduceMean_Op &>(mOp); bool keepDims = rmOp.keepDims(); - auto axes = rmOp.axes(); + auto axes = rmOp.axes(); if (axes.empty()) { - input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); - } - else { - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axes, keepDims); - break; - case DataType::Float32: - forward_<float>(input, axes, keepDims); - break; - case DataType::Float16: - forward_<half>(input, axes, keepDims); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + input.getImpl()->copy( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + input.size()); + } else { + switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } } - template <class T> -void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor &input, + const std::vector<int> &axes, + bool keepDims) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnReduceTensorDescriptor_t reduceDesc; cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + 
outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - } - else { + } else { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(outputDims)); + for (const auto axis : axes) { outputDims[axis] = 1; } if (outputDims.size() < 4) { @@ -114,39 +135,50 @@ void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector outputStrides[i - 1] = product; product *= outputDims[i - 1]; } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(outputDesc, + CudaContext::data_type<T>::value, + outputDims.size(), + outputDims.data(), + outputStrides.data())); + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + 
outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); @@ -154,47 +186,57 @@ void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector } void Aidge::ReduceMeanImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceMean operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceMean backward because the output grad has no implementation."); - - const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); - - const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); - auto axes = rmOp.axes(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outGrad, axes); - break; - case DataType::Float32: - backward_<float>(outGrad, axes); - break; - case DataType::Float16: - backward_<half>(outGrad, axes); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in ReduceMean operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run ReduceMean backward because the output grad has " + "no implementation."); + + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)->grad()); + + const ReduceMean_Op &rmOp = static_cast<const ReduceMean_Op &>(mOp); + auto axes = rmOp.axes(); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor &outGrad, + const std::vector<int> &axes) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); std::vector<std::size_t> factors; - for (auto axis:axes) { + for (auto axis : axes) { factors.push_back(op.getInput(0)->grad()->dims()[axis]); } - + Aidge::ReduceBackward(outputGrad, - inputGrad, - outGrad.dims(), - op.getInput(0)->grad()->dims(), - axes, - factors, - static_cast<int>(op.getInput(0)->grad()->size())); + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); } diff --git a/src/operator/ReduceSumImpl.cpp 
b/src/operator/ReduceSumImpl.cpp index 895584d87dab88f3f71a424a02a3b32954c4dc43..9bd6d839d209874704feb4830b3e73704f795e4d 100644 --- a/src/operator/ReduceSumImpl.cpp +++ b/src/operator/ReduceSumImpl.cpp @@ -15,93 +15,114 @@ #include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/ReduceSum.hpp" #include "aidge/utils/Types.h" void Aidge::ReduceSumImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in ReduceSum operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceSum forward because the input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run ReduceSum forward because the input has no " + "implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const ReduceSum_Op& rsOp = static_cast<const ReduceSum_Op&>(mOp); + const ReduceSum_Op &rsOp = static_cast<const ReduceSum_Op &>(mOp); bool keepDims = rsOp.keepDims(); - auto axes = rsOp.axes(); + auto axes = rsOp.axes(); if (axes.empty()) { - input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); - } - else { - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axes, keepDims); - break; - case DataType::Float32: - forward_<float>(input, axes, keepDims); - break; - case DataType::Float16: - forward_<half>(input, axes, keepDims); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + input.getImpl()->copy( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + input.size()); + } else { + switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } } - template <class T> -void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceSumImpl_cuda::forward_(const Tensor &input, + const std::vector<int> &axes, + bool keepDims) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnReduceTensorDescriptor_t reduceDesc; cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + 
outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - } - else { + } else { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(outputDims)); + for (const auto axis : axes) { outputDims[axis] = 1; } if (outputDims.size() < 4) { @@ -114,39 +135,50 @@ void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector< outputStrides[i - 1] = product; product *= outputDims[i - 1]; } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(outputDesc, + CudaContext::data_type<T>::value, + outputDims.size(), + outputDims.data(), + outputStrides.data())); + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); @@ -154,46 +186,56 @@ void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector< } void Aidge::ReduceSumImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceSum operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceSum backward because the output grad has no implementation."); - - const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); - - const ReduceSum_Op& rmOp = static_cast<const ReduceSum_Op&>(mOp); - auto axes = rmOp.axes(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outGrad, axes); - break; - case DataType::Float32: - backward_<float>(outGrad, axes); - break; - case DataType::Float16: - backward_<half>(outGrad, axes); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in ReduceSum operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run ReduceSum backward because the output grad has " + "no implementation."); + + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)->grad()); + + const ReduceSum_Op &rmOp = static_cast<const ReduceSum_Op &>(mOp); + auto axes = rmOp.axes(); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceSumImpl_cuda::backward_(const Tensor &outGrad, + const std::vector<int> &axes) { + const OperatorTensor &op = 
static_cast<const OperatorTensor &>(mOp); - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); std::vector<std::size_t> factors; - for (auto axis:axes) { + for (auto axis : axes) { factors.push_back(op.getInput(0)->grad()->dims()[axis]); } - + Aidge::ReduceBackward(outputGrad, - inputGrad, - outGrad.dims(), - op.getInput(0)->grad()->dims(), - axes, - factors, - static_cast<int>(op.getInput(0)->grad()->size())); + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); } diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp index 783e244057b0fc42a782fd363c3a99aa6d73b46b..159550fff7b23448bcee7c0f0ecf14412d6910d4 100644 --- a/src/operator/ReshapeImpl.cpp +++ b/src/operator/ReshapeImpl.cpp @@ -22,20 +22,29 @@ #include "aidge/utils/Types.h" void Aidge::ReshapeImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // FIXME: uncomment the following code once memory handling will work assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))-> getImpl() -> setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); } void Aidge::ReshapeImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->grad() + ->getImpl() + ->setRawPtr(output_grad.getImpl()->rawPtr(), + output_grad.getImpl()->size()); } diff --git a/src/operator/ShiftGELUImpl.cpp b/src/operator/ShiftGELUImpl.cpp index c2774804d04a422aefd0c66ed0d1fc1d949b1f06..4e4329af7cabecb60db2b776812b0fd516c90939 100644 --- a/src/operator/ShiftGELUImpl.cpp +++ b/src/operator/ShiftGELUImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include 
"aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" @@ -30,42 +30,48 @@ void Aidge::ShiftGELUImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input.size(); std::vector<DimSize_t> dims_input = input.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input.getIdx(coordIdx); if (newFlatIdx < min) { @@ -74,46 +80,57 @@ void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input) if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ShiftGELUforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The 
new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ShiftGELUforward(input_raw, + output, + scaling_factor, + N, + output_bits, + size, + dims_input); } void Aidge::ShiftGELUImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); +void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); size_t size = output_grad.size(); - T * output = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *output = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); ShiftGELUbackward(input, output_grad_raw, output, size); - } \ No newline at end of file diff --git a/src/operator/ShiftMaxImpl.cpp b/src/operator/ShiftMaxImpl.cpp index 1134cc5d6b99e53eb492c82e32d811bc0bcba0e0..2abb85ef435b86fd7c016cc9ae6ed5c83c5cee51 100644 --- a/src/operator/ShiftMaxImpl.cpp +++ b/src/operator/ShiftMaxImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" @@ -30,42 +30,48 @@ void Aidge::ShiftMaxImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch ( + 
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input.size(); std::vector<DimSize_t> dims_input = input.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input.getIdx(coordIdx); if (newFlatIdx < min) { @@ -74,48 +80,63 @@ void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input) if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ShiftMaxforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ShiftMaxforward(input_raw, + output, + scaling_factor, + N, + output_bits, + size, + dims_input); } - void Aidge::ShiftMaxImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { 
backward_<float>(output_grad); } } template <class T> -void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * output_tensor = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); +void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *output_tensor = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); size_t size = output_grad.size(); std::vector<DimSize_t> dims_output = output_grad.dims(); - T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); - - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); - ShiftMaxbackward(output_tensor, output_grad_raw, input_grad, size, dims_output); + T *input_grad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); + ShiftMaxbackward(output_tensor, + output_grad_raw, + input_grad, + size, + dims_output); } \ No newline at end of file diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp index 386cd9d821b3019cf8f0de2cc757ae514446f1a6..348d64076bab8d459a3920b35ea1bb750234722e 100644 --- a/src/operator/SigmoidImpl.cpp +++ b/src/operator/SigmoidImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::SigmoidImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN Sigmoid descriptor if (mSigmoidDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mSigmoidDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual forward computation @@ -42,44 +46,52 @@ void Aidge::SigmoidImpl_cuda::forward() { // excepted when the convolution is performed in double precision. 
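    // The Sigmoid implementation follows the same structure as the ReLU one
    // earlier in this patch: a lazily created cuDNN activation descriptor
    // (configured with CUDNN_ACTIVATION_SIGMOID) and a dispatch on the output
    // data type to run the computation in double or float precision.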
if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <class T> -void Aidge::SigmoidImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SigmoidImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mSigmoidDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::SigmoidImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize CuDNN Sigmoid descriptor if (mSigmoidDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mSigmoidDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual backward computation @@ -87,37 +99,44 @@ void Aidge::SigmoidImpl_cuda::backward() { // excepted when the convolution is performed in double precision. 
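// For reference, a condensed sketch of the cudnnReduceTensor workspace pattern
// used by the ReduceMean/ReduceSum hunks earlier in this patch and by the
// broadcast branch of Sub::backward further down: query the workspace size,
// allocate it, run the reduction, release it. The helper below is
// hypothetical; it assumes float data, descriptors already configured by the
// caller, and CUDNN_REDUCE_TENSOR_AVG (the sum/sub cases use
// CUDNN_REDUCE_TENSOR_ADD instead).
#include <cuda_runtime.h>
#include <cudnn.h>

static cudnnStatus_t reduceAvgSketch(cudnnHandle_t handle,
                                     cudnnTensorDescriptor_t inDesc,
                                     const float *in,
                                     cudnnTensorDescriptor_t outDesc,
                                     float *out) {
    cudnnReduceTensorDescriptor_t reduceDesc;
    cudnnStatus_t st = cudnnCreateReduceTensorDescriptor(&reduceDesc);
    if (st != CUDNN_STATUS_SUCCESS)
        return st;
    st = cudnnSetReduceTensorDescriptor(reduceDesc,
                                        CUDNN_REDUCE_TENSOR_AVG,
                                        CUDNN_DATA_FLOAT,
                                        CUDNN_PROPAGATE_NAN,
                                        CUDNN_REDUCE_TENSOR_NO_INDICES,
                                        CUDNN_32BIT_INDICES);

    // Ask cuDNN how much scratch memory this reduction needs.
    size_t workspaceSize = 0;
    if (st == CUDNN_STATUS_SUCCESS)
        st = cudnnGetReductionWorkspaceSize(handle, reduceDesc,
                                            inDesc, outDesc, &workspaceSize);

    void *workspace = nullptr;
    if (st == CUDNN_STATUS_SUCCESS)
        cudaMalloc(&workspace, workspaceSize); // allocation check omitted for brevity

    // out = alpha * reduce(in) + beta * out
    const float alpha = 1.0f, beta = 0.0f;
    if (st == CUDNN_STATUS_SUCCESS)
        st = cudnnReduceTensor(handle, reduceDesc,
                               nullptr, 0, // no indices requested
                               workspace, workspaceSize,
                               &alpha, inDesc, in,
                               &beta, outDesc, out);

    cudaFree(workspace); // release the temporary workspace
    cudnnDestroyReduceTensorDescriptor(reduceDesc);
    return st;
}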
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::SigmoidImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SigmoidImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mSigmoidDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::SigmoidImpl_cuda::~SigmoidImpl_cuda() { if (mSigmoidDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mSigmoidDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mSigmoidDesc); +#endif } } - diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index a04a1c3018b0c9ba455d21ba563253eb3e004e10..e63b9c04f89cd7cf1919006106dfe97b1833b935 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -22,27 +22,39 @@ #include "aidge/utils/Types.h" void Aidge::SubImpl_cuda::forward() { - const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + const Sub_Op &op = static_cast<const Sub_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Sub operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sub forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Sub forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Sub operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sub forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == 
datatypeFirstInput, "Cannot add inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Sub forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot add inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -58,76 +70,106 @@ void Aidge::SubImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SubImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - // Add first input to the output CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), - &beta, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); + // Add first input to the output + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), + &alpha, + tensorDesc, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); // Substract other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) - { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); + for (size_t i = 1; i < op.nbInputs(); ++i) { + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), - &gamma, - tensorDesc, - inputs[i].getImpl()->rawPtr(), - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + &gamma, + tensorDesc, + inputs[i].getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::SubImpl_cuda::backward() { - const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + const Sub_Op &op = static_cast<const Sub_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Sub operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Sub backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Sub operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Sub backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + 
dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); @@ -139,83 +181,97 @@ void Aidge::SubImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SubImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; - for (std::size_t i = 0; i < inputsDims.size(); i++) - { - if (op.getInput(i)->size() == op.getOutput(0)->size()) - { + for (std::size_t i = 0; i < inputsDims.size(); i++) { + if (op.getInput(i)->size() == op.getOutput(0)->size()) { CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - i==0 ? &alpha: &gamma, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(i)->getImpl())->getCudnnTensorDesc(*op.getInput(i)), - op.getInput(i)->grad()->getImpl()->rawPtr())); - } - else // In case of broadcasting + cudnnAddTensor(CudaContext::cudnnHandle(), + i == 0 ? 
&alpha : &gamma, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(i)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(i)), + op.getInput(i)->grad()->getImpl()->rawPtr())); + } else // In case of broadcasting { - // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor + // Gradient with respect to input_i: sum outputGrad over the + // broadcasted dimensions using cudnnReduceTensor cudnnReduceTensorDescriptor_t reduceDesc; CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor( + reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + cudnnTensorDescriptor_t outputDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>( + outputGrad.getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS( + cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + outputDesc, + tensorDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - i==0 ? &alpha: &gamma, - outputDesc, - outputGrad.getImpl()->rawPtr(), - &beta, - tensorDesc, - op.getInput(i)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + i == 0 ? 
&alpha : &gamma, + outputDesc, + outputGrad.getImpl()->rawPtr(), + &beta, + tensorDesc, + op.getInput(i)->grad()->getImpl()->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp index 96c0330febba35cfea04bbbac97d9308195d6309..f217ed867c1357b977cf9a3cf367ab493641b257 100644 --- a/src/operator/TanhImpl.cpp +++ b/src/operator/TanhImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::TanhImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN Tanh descriptor if (mTanhDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mTanhDesc = CUDNN_ACTIVATION_TANH; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mTanhDesc, + CUDNN_ACTIVATION_TANH, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mTanhDesc = CUDNN_ACTIVATION_TANH; +#endif } // Do the actual forward computation @@ -42,44 +46,51 @@ void Aidge::TanhImpl_cuda::forward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } -template <class T> -void Aidge::TanhImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::TanhImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mTanhDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::TanhImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize 
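
The broadcasting branch of SubImpl_cuda::backward_ above reduces the output gradient over the broadcast dimensions with cudnnReduceTensor. Below is a minimal standalone sketch of that pattern; it is not part of the patch, the helper name and the srcDesc/dstDesc descriptors are placeholders, and it assumes float data plus the backend's CudaContext and CHECK_* macros. The sketch also releases the workspace and the reduce descriptor, which any complete implementation needs to do once the reduction has run.

#include <cstddef>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "aidge/backend/cuda/utils/CudaContext.hpp" // CudaContext::cudnnHandle()
#include "aidge/backend/cuda/utils/CudaUtils.hpp"   // CHECK_CUDA_STATUS / CHECK_CUDNN_STATUS

// Hypothetical helper: accumulate `srcGrad` (full output-gradient layout,
// described by srcDesc) into `dstGrad` (broadcasted input layout, described
// by dstDesc), scaled by +1 for the first Sub input and -1 for the others.
static void reduceBroadcastGrad(cudnnTensorDescriptor_t srcDesc,
                                const void *srcGrad,
                                cudnnTensorDescriptor_t dstDesc,
                                void *dstGrad,
                                float scale) {
    cudnnReduceTensorDescriptor_t reduceDesc;
    CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
    CHECK_CUDNN_STATUS(
        cudnnSetReduceTensorDescriptor(reduceDesc,
                                       CUDNN_REDUCE_TENSOR_ADD,
                                       CUDNN_DATA_FLOAT,
                                       CUDNN_PROPAGATE_NAN,
                                       CUDNN_REDUCE_TENSOR_NO_INDICES,
                                       CUDNN_32BIT_INDICES));

    // Query and allocate the scratch space the reduction needs.
    size_t wsSize = 0;
    CHECK_CUDNN_STATUS(
        cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
                                       reduceDesc,
                                       srcDesc,
                                       dstDesc,
                                       &wsSize));
    void *ws = nullptr;
    CHECK_CUDA_STATUS(cudaMalloc(&ws, wsSize));

    const float beta = 0.0f; // overwrite dstGrad
    CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
                                         reduceDesc,
                                         NULL,
                                         0,
                                         ws,
                                         wsSize,
                                         &scale,
                                         srcDesc,
                                         srcGrad,
                                         &beta,
                                         dstDesc,
                                         dstGrad));

    // Release the temporaries once the reduction has completed.
    CHECK_CUDA_STATUS(cudaFree(ws));
    CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
}
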
CuDNN Tanh descriptor if (mTanhDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mTanhDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mTanhDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mTanhDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mTanhDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual backward computation @@ -87,37 +98,44 @@ void Aidge::TanhImpl_cuda::backward() { // excepted when the convolution is performed in double precision. if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::TanhImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mTanhDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::TanhImpl_cuda::~TanhImpl_cuda() { if (mTanhDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mTanhDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mTanhDesc); +#endif } } - diff --git a/src/utils/CudaUtils.cpp b/src/utils/CudaUtils.cpp index ca3263a282322e70157b7537c502a63a3edb526f..7e6abadde7b465829b8e49365c4065759e9e850e 100644 --- a/src/utils/CudaUtils.cpp +++ b/src/utils/CudaUtils.cpp @@ -1,7 +1,6 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" -const char* 
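
One observation on the TanhImpl.cpp hunks above: backward() lazily creates its activation descriptor with CUDNN_ACTIVATION_SIGMOID (carried over unchanged from the pre-existing code), which for a Tanh operator presumably should be CUDNN_ACTIVATION_TANH, since cudnnActivationBackward applies the derivative selected by the descriptor mode. A minimal sketch of the intended descriptor lifecycle, assuming CUDNN_VERSION >= 5000 and the backend's CHECK_CUDNN_STATUS macro (names are illustrative, not part of the patch):

#include <cudnn.h>
#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDNN_STATUS

// A single Tanh descriptor can serve both cudnnActivationForward and
// cudnnActivationBackward; only the mode it encodes matters.
static cudnnActivationDescriptor_t tanhDesc = nullptr;

static void ensureTanhDesc() {
    if (tanhDesc == nullptr) {
        CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&tanhDesc));
        CHECK_CUDNN_STATUS(
            cudnnSetActivationDescriptor(tanhDesc,
                                         CUDNN_ACTIVATION_TANH, // not SIGMOID
                                         CUDNN_NOT_PROPAGATE_NAN,
                                         0.0));
    }
}

static void releaseTanhDesc() {
    if (tanhDesc != nullptr) {
        cudnnDestroyActivationDescriptor(tanhDesc);
        tanhDesc = nullptr;
    }
}
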
Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) -{ +const char *Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -28,19 +27,24 @@ const char* Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) return "<unknown>"; } -void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devices) -{ +void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, + unsigned int *devices) { for (unsigned int i = 0; i < size; ++i) { for (unsigned int j = 0; j < size; ++j) { if (i != j) { int canAccessPeer = 0; CHECK_CUDA_STATUS(cudaDeviceCanAccessPeer(&canAccessPeer, - devices[j], devices[i])); + devices[j], + devices[i])); if (canAccessPeer) { CHECK_CUDA_STATUS(cudaSetDevice(devices[j])); - const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0); + const cudaError_t status = + cudaDeviceEnablePeerAccess(devices[i], 0); if (status == cudaErrorPeerAccessAlreadyEnabled) { - fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]); + fmt::print("Peer access already enabled between " + "device {} and device {}\n", + devices[j], + devices[i]); } else { CHECK_CUDA_STATUS(status); } diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp index dffabe6aab92bdfdd0c79b61ab59e9bc6efb9d94..3993cdf3f60dd1eaab11b87ede06b53b59d06832 100644 --- a/unit_tests/Test_AddImpl.cpp +++ b/unit_tests/Test_AddImpl.cpp @@ -9,9 +9,9 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -23,59 +23,60 @@ using namespace Aidge; TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { SECTION("Same input") { - std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - input1->setBackend("cuda"); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { + std::shared_ptr<Tensor> input1 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{40, 94},{42, 96},{44, 98}}, - {{46, 100},{48, 102},{50, 104}}, - {{52, 106},{54, 108},{56, 110}} - }, - { - {{58, 112},{60, 114},{62, 116}}, - {{64, 118},{66, 120},{68, 122}}, - {{70, 124},{72, 126},{74, 128}} - }, - { - {{76, 130},{78, 132},{80, 134}}, - {{82, 136},{84, 138},{86, 140}}, - {{88, 142},{90, 144},{92, 146}} - } - } - }); + // + { + // + {{20, 47}, {21, 48}, {22, 49}}, // + {{23, 50}, {24, 51}, {25, 52}}, // + {{26, 53}, {27, 54}, {28, 55}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{32, 59}, {33, 60}, {34, 61}}, // + {{35, 62}, {36, 63}, {37, 64}} // + }, // + { + // + {{38, 65}, {39, 66}, {40, 67}}, // + {{41, 68}, {42, 69}, {43, 70}}, // + {{44, 71}, {45, 72}, {46, 73}} // + } // + } // + }); // + input1->setBackend("cuda"); + std::shared_ptr<Tensor> 
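
For the CudaUtils.cpp hunk above, a short usage sketch of setMultiDevicePeerAccess; the device IDs are placeholders and a real caller would pick devices appropriate to its setup:

#include <cuda_runtime.h>
#include "aidge/backend/cuda/utils/CudaUtils.hpp"

int main() {
    int deviceCount = 0;
    CHECK_CUDA_STATUS(cudaGetDeviceCount(&deviceCount));
    if (deviceCount >= 2) {
        // Enable peer access in both directions between the first two GPUs;
        // pairs that are already enabled are simply reported and skipped.
        unsigned int devices[2] = {0, 1};
        Aidge::Cuda::setMultiDevicePeerAccess(2, devices);
    }
    return 0;
}
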
expectedOutput = std::make_shared<Tensor>( + Array4D<float, 3, 3, 3, 2>{{{{{40, 94}, {42, 96}, {44, 98}}, + {{46, 100}, {48, 102}, {50, 104}}, + {{52, 106}, {54, 108}, {56, 110}}}, + {{{58, 112}, {60, 114}, {62, 116}}, + {{64, 118}, {66, 120}, {68, 122}}, + {{70, 124}, {72, 126}, {74, 128}}}, + {{{76, 130}, {78, 132}, {80, 134}}, + {{82, 136}, {84, 138}, {86, 140}}, + {{88, 142}, {90, 144}, {92, 146}}}}}); std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAdd->getOperator()); op->associateInput(0, input1); op->associateInput(1, input1); op->setBackend("cuda"); op->setDataType(DataType::Float32); myAdd->forward(); - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < expectedOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedOutput->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -83,99 +84,131 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{20, 21},{22, 23},{24, 25}}, // - {{26, 27},{28, 29},{30, 31}}, // - {{32, 33},{34, 35},{36, 37}} // - } // - } // - }); // - - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); + std::shared_ptr<Tensor> input_0 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{0, 1}, {2, 3}, {4, 5}} // + }, // + { + // + {{6, 7}, {8, 9}, {10, 11}} // + }, // + { + // + {{12, 13}, {14, 15}, {16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{20, 21}, {22, 23}, {24, 25}}, // + {{26, 27}, {28, 29}, {30, 31}}, // + {{32, 33}, {34, 35}, {36, 37}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array1D<float, 2>{{100, 200}}); input_0->setBackend("cuda"); input_1->setBackend("cuda"); input_2->setBackend("cuda"); - /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput0 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{ 20, 22},{ 24, 26},{ 28, 30}}, // - {{ 26, 28},{ 30, 32},{ 34, 36}}, // - {{ 32, 34},{ 36, 38},{ 40, 42}} // - }, // - { // - {{ 26, 28},{ 30, 32},{ 34, 36}}, // - {{ 32, 34},{ 36, 38},{ 40, 42}}, // - {{ 38, 40},{ 42, 44},{ 46, 48}} // - }, // - { // - {{ 32, 34},{ 36, 38},{40, 42}}, // - {{ 38, 40},{ 42, 44},{46, 48}}, // - {{ 44, 46},{ 48, 50},{52, 54}} // - } // - } // - }); // + /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, + /// d3) + std::shared_ptr<Tensor> expectedOutput0 = + 
std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ + { + // + { + // + {{20, 22}, {24, 26}, {28, 30}}, // + {{26, 28}, {30, 32}, {34, 36}}, // + {{32, 34}, {36, 38}, {40, 42}} // + }, // + { + // + {{26, 28}, {30, 32}, {34, 36}}, // + {{32, 34}, {36, 38}, {40, 42}}, // + {{38, 40}, {42, 44}, {46, 48}} // + }, // + { + // + {{32, 34}, {36, 38}, {40, 42}}, // + {{38, 40}, {42, 44}, {46, 48}}, // + {{44, 46}, {48, 50}, {52, 54}} // + } // + } // + }); // std::shared_ptr<Node> myAdd0 = Add(); - auto op0 = std::static_pointer_cast<OperatorTensor>(myAdd0 -> getOperator()); + auto op0 = + std::static_pointer_cast<OperatorTensor>(myAdd0->getOperator()); op0->associateInput(0, input_0); op0->associateInput(1, input_1); op0->setDataType(DataType::Float32); op0->setBackend("cuda"); myAdd0->forward(); - float* computedOutput0 = new float[expectedOutput0->size()](); - cudaMemcpy(computedOutput0, op0->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput0->size(), cudaMemcpyDeviceToHost); + float *computedOutput0 = new float[expectedOutput0->size()](); + cudaMemcpy(computedOutput0, + op0->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput0->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < expectedOutput0->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput0->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedOutput0->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedOutput0->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput0[i] - targetOutput) < 1e-6); } delete[] computedOutput0; /// Input0(d0, d1, d2, d3) + Input1(d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{100, 201},{102, 203},{104, 205}} // - }, // - { // - {{106, 207},{108, 209},{110, 211}} // - }, // - { // - {{112, 213},{114, 215},{116, 217}} // - } // - } // - }); // + std::shared_ptr<Tensor> expectedOutput1 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{100, 201}, {102, 203}, {104, 205}} // + }, // + { + // + {{106, 207}, {108, 209}, {110, 211}} // + }, // + { + // + {{112, 213}, {114, 215}, {116, 217}} // + } // + } // + }); // std::shared_ptr<Node> myAdd1 = Add(); - auto op1 = std::static_pointer_cast<OperatorTensor>(myAdd1 -> getOperator()); + auto op1 = + std::static_pointer_cast<OperatorTensor>(myAdd1->getOperator()); op1->associateInput(0, input_0); op1->associateInput(1, input_2); op1->setDataType(DataType::Float32); op1->setBackend("cuda"); myAdd1->forward(); - float* computedOutput1 = new float[expectedOutput1->size()](); - cudaMemcpy(computedOutput1, op1->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput1->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedOutput1->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput1->getImpl()->rawPtr()) + i); + float *computedOutput1 = new float[expectedOutput1->size()](); + cudaMemcpy(computedOutput1, + op1->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput1->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedOutput1->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedOutput1->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput1[i] - targetOutput) < 1e-6); } @@ -188,26 +221,30 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // 
Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Add Operator CUDA std::shared_ptr<Node> myAddCUDA = Add("myaddcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myAddCUDA->getOperator()); // Create Add Operator CPU std::shared_ptr<Node> myAddCPU = Add("myaddcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myAddCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -218,23 +255,35 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { // To test broadcasting, set some dims to 1 if (boolDist(gen)) { dims0.push_back(1); - }else{ + } else { dims0.push_back(dim); } if (boolDist(gen)) { dims1.push_back(1); - }else{ + } else { dims1.push_back(dim); } dims.push_back(std::max(dims0[i], dims1[i])); } - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; for (std::size_t i = 0; i < nb_elements0; ++i) { array0[i] = valueDist(gen); @@ -244,23 +293,27 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + 
cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -268,17 +321,21 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); // input1 CPU std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); + op_cpu->associateInput(1, T1_cpu); T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -286,14 +343,19 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); float *computedOutput = new float[nb_elements](); - cudaMemcpy(computedOutput, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computedOutput, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computedCPU = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computedOutput, *computedCPU)); delete[] array0; @@ -301,107 +363,134 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { delete[] computedOutput; cudaFree(array0_d); cudaFree(array1_d); - } } - } TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { - std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{20, 21},{22, 23},{24, 25}}, // - {{26, 27},{28, 29},{30, 31}}, // - {{32, 33},{34, 35},{36, 37}} // - } // - } // - }); // - - input_0->setBackend("cuda"); - input_1->setBackend("cuda"); - std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input_0); - op->associateInput(1, input_1); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myAdd->forward(); - - // Run and test backward operation - 
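
All of the GPU tests in this patch verify results the same way: run forward() on the cuda backend, cudaMemcpy the raw output back to the host, and compare element-wise against a reference (either hard-coded or produced by the cpu backend). A hypothetical helper condensing that pattern is sketched below; it assumes float outputs and the headers the test files already include, plus <vector> and <cmath>. Using std::vector also sidesteps the manual new[]/delete[] pairs the tests currently carry.

// Illustrative only -- not part of the patch.
static void requireCudaOutputMatches(const std::shared_ptr<Tensor> &gpuOut,
                                     const std::shared_ptr<Tensor> &expected) {
    // Bring the device buffer back to the host.
    std::vector<float> host(gpuOut->size());
    CHECK_CUDA_STATUS(cudaMemcpy(host.data(),
                                 gpuOut->getImpl()->rawPtr(),
                                 sizeof(float) * gpuOut->size(),
                                 cudaMemcpyDeviceToHost));

    // Element-wise comparison with the same tolerance the tests use.
    const float *ref =
        static_cast<const float *>(expected->getImpl()->rawPtr());
    for (std::size_t i = 0; i < gpuOut->size(); ++i) {
        REQUIRE(std::fabs(host[i] - ref[i]) < 1e-6f);
    }
}

A test would then end with, e.g., requireCudaOutputMatches(op->getOutput(0), expectedOutput); instead of the repeated cudaMemcpy/loop/delete[] block.
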
std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{ 1, 2},{ 3, 4},{ 5, 6}}, // - {{ 7, 8},{ 9, 10},{ 11, 12}}, // - {{ 13, 14},{ 15, 16},{ 17, 18}} // - }, // - { // - {{ 19, 20},{ 21, 22},{ 23, 24}}, // - {{ 25, 26},{ 27, 28},{ 29, 30}}, // - {{ 31, 32},{ 33, 34},{ 35, 36}} // - }, // - { // - {{ 37, 38},{ 39, 40},{41, 42}}, // - {{ 43, 44},{ 45, 46},{47, 48}}, // - {{ 49, 50},{ 51, 52},{53, 54}} // - } // - } // - }); // - myOutputGrad->setBackend("cuda"); - op->getOutput(0)->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myAdd->backward()); - - std::shared_ptr<Tensor> expectedInput1Grad = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{21, 24},{27, 30},{33, 36}} // - }, // - { // - {{75, 78},{81, 84},{87, 90}} // - }, // - { // - {{129, 132},{135, 138},{141, 144}}// - } // - } // - }); // - std::shared_ptr<Tensor> expectedInput2Grad = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{57, 60},{63, 66},{69, 72}}, // - {{75, 78},{81, 84},{87, 90}}, // - {{93, 96},{99, 102},{105, 108}} // - } // - } // - }); // - - float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); - cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost); - float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); - cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInput1Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); - } - for(int i = 0; i < expectedInput2Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); - } + std::shared_ptr<Tensor> input_0 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{0, 1}, {2, 3}, {4, 5}} // + }, // + { + // + {{6, 7}, {8, 9}, {10, 11}} // + }, // + { + // + {{12, 13}, {14, 15}, {16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{20, 21}, {22, 23}, {24, 25}}, // + {{26, 27}, {28, 29}, {30, 31}}, // + {{32, 33}, {34, 35}, {36, 37}} // + } // + } // + }); // + + input_0->setBackend("cuda"); + input_1->setBackend("cuda"); + std::shared_ptr<Node> myAdd = Add(); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd->getOperator()); + op->associateInput(0, input_0); + op->associateInput(1, input_1); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAdd->forward(); + + // Run and test backward operation + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ + { + // + { + // + {{1, 2}, {3, 4}, {5, 6}}, // + {{7, 8}, {9, 10}, {11, 12}}, // + {{13, 14}, {15, 16}, {17, 18}} // + }, // + { + // + {{19, 20}, {21, 22}, {23, 24}}, // + {{25, 26}, {27, 28}, {29, 30}}, // + {{31, 32}, {33, 34}, {35, 36}} // + }, // + { + // + {{37, 38}, {39, 40}, {41, 42}}, // + {{43, 44}, {45, 46}, {47, 48}}, // + {{49, 50}, {51, 52}, {53, 54}} // + } // + } // + }); // + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myAdd->backward()); + + std::shared_ptr<Tensor> expectedInput1Grad = + 
std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{21, 24}, {27, 30}, {33, 36}} // + }, // + { + // + {{75, 78}, {81, 84}, {87, 90}} // + }, // + { + // + {{129, 132}, {135, 138}, {141, 144}} // + } // + } // + }); // + std::shared_ptr<Tensor> expectedInput2Grad = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{57, 60}, {63, 66}, {69, 72}}, // + {{75, 78}, {81, 84}, {87, 90}}, // + {{93, 96}, {99, 102}, {105, 108}} // + } // + } // + }); // + + float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); + cudaMemcpy(computedGrad1Cuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInput1Grad->size(), + cudaMemcpyDeviceToHost); + float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); + cudaMemcpy(computedGrad2Cuda, + op->getInput(1)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInput2Grad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInput1Grad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInput1Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); + } + for (int i = 0; i < expectedInput2Grad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInput2Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); + } - delete[] computedGrad1Cuda; - delete[] computedGrad2Cuda; + delete[] computedGrad1Cuda; + delete[] computedGrad2Cuda; } \ No newline at end of file diff --git a/unit_tests/Test_AndImpl.cpp b/unit_tests/Test_AndImpl.cpp index 66de926088bb47c06ea1f9f10655730404787149..1e6ee3396b7ce729c73a6792cf9698cf15cd1013 100644 --- a/unit_tests/Test_AndImpl.cpp +++ b/unit_tests/Test_AndImpl.cpp @@ -9,8 +9,8 @@ * ********************************************************************************/ -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -21,104 +21,113 @@ using namespace Aidge; TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { SECTION("Same size inputs") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 15},{31, 11},{22, 49}}, // - {{41, 10},{24, 51},{27, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - input_1->setBackend("cuda"); - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{17, 53},{27, 26},{14, 33}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{72, 44},{33, 20},{27, 55}}, // - {{35, 24},{25, 63},{28, 64}} // - }, // - { // - {{32, 65},{39, 66},{40, 70}}, // - {{41, 53},{42, 60},{34, 70}}, // - {{44, 71},{30, 12},{46, 73}} // - } // - } // - }); // - input_2->setBackend("cuda"); - const Tensor myOutput = Tensor(Array4D<float,3,3,3,2> { - { + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{1, 0},{0, 0},{1, 1}}, - {{0, 0},{1, 1},{0, 1}}, - {{0, 1},{1, 0},{0, 0}} - }, - { - {{1, 1},{1, 1},{1, 1}}, - 
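
As a worked check of the Add(backward) expectations in Test_AddImpl.cpp above: input_0 has shape (3, 1, 3, 2) and is broadcast along axis 1, so its gradient sums myOutputGrad over that axis, e.g. 1 + 7 + 13 = 21 for the first entry of expectedInput1Grad; input_1 has shape (1, 3, 3, 2) and is broadcast along axis 0, giving 1 + 19 + 37 = 57 for the first entry of expectedInput2Grad. This is the same cudnnReduceTensor accumulation sketched after the SubImpl.cpp hunks earlier in this patch, with +1 scaling for both inputs since Add has no sign flip.
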
{{0, 0},{1, 0},{0, 0}}, - {{1, 0},{0, 1},{0, 1}} - }, + // + { + // + {{20, 15}, {31, 11}, {22, 49}}, // + {{41, 10}, {24, 51}, {27, 52}}, // + {{26, 53}, {27, 54}, {28, 55}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{32, 59}, {33, 60}, {34, 61}}, // + {{35, 62}, {36, 63}, {37, 64}} // + }, // + { + // + {{38, 65}, {39, 66}, {40, 67}}, // + {{41, 68}, {42, 69}, {43, 70}}, // + {{44, 71}, {45, 72}, {46, 73}} // + } // + } // + }); // + input_1->setBackend("cuda"); + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{0, 1},{1, 1},{1, 0}}, - {{1, 0},{1, 0},{0, 1}}, - {{1, 1},{0, 0},{1, 1}} - } - } - }); + // + { + // + {{20, 47}, {21, 48}, {22, 49}}, // + {{23, 50}, {24, 51}, {25, 52}}, // + {{17, 53}, {27, 26}, {14, 33}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{72, 44}, {33, 20}, {27, 55}}, // + {{35, 24}, {25, 63}, {28, 64}} // + }, // + { + // + {{32, 65}, {39, 66}, {40, 70}}, // + {{41, 53}, {42, 60}, {34, 70}}, // + {{44, 71}, {30, 12}, {46, 73}} // + } // + } // + }); // + input_2->setBackend("cuda"); + const Tensor myOutput = + Tensor(Array4D<float, 3, 3, 3, 2>{{{{{1, 0}, {0, 0}, {1, 1}}, + {{0, 0}, {1, 1}, {0, 1}}, + {{0, 1}, {1, 0}, {0, 0}}}, + {{{1, 1}, {1, 1}, {1, 1}}, + {{0, 0}, {1, 0}, {0, 0}}, + {{1, 0}, {0, 1}, {0, 1}}}, + {{{0, 1}, {1, 1}, {1, 0}}, + {{1, 0}, {1, 0}, {0, 1}}, + {{1, 1}, {0, 0}, {1, 1}}}}}); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setBackend("cuda"); op->setDataType(DataType::Float32); myAnd->forward(); - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{10, 20},{22, 23},{20, 20}}, // - {{10, 15},{10, 29},{20, 20}}, // - {{26, 25},{33, 20},{10, 20}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{10, 20}, {22, 23}, {20, 20}}, // + {{10, 15}, {10, 29}, {20, 20}}, // + {{26, 25}, {33, 20}, {10, 20}} // + } // + } // + }); // input_1->setBackend("cuda"); - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}}); - const Tensor myOutput = Tensor(Array4D<float,1,3,3,2> { - { // - { // - {{ 1, 1},{ 0, 0},{ 0, 1}}, // - {{ 1, 0},{ 1, 0},{ 0, 1}}, // - {{ 0, 0},{ 0, 1},{ 1, 1}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array1D<float, 2>{{10, 20}}); + const Tensor myOutput = Tensor(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{1, 1}, {0, 0}, {0, 1}}, // + {{1, 0}, {1, 0}, {0, 1}}, // + {{0, 0}, {0, 1}, {1, 1}} // + } // + } // + }); // input_2->setBackend("cuda"); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setDataType(DataType::Float32); @@ -126,7 +135,8 @@ TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { myAnd->forward(); std::shared_ptr<Tensor> 
outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } \ No newline at end of file diff --git a/unit_tests/Test_ArgMaxImpl.cpp b/unit_tests/Test_ArgMaxImpl.cpp index d123b5bd3376c7169b2e003d8b366bb9045fe3e1..0fe7927fdb8ed9513f0e894aef428ead9abc238d 100644 --- a/unit_tests/Test_ArgMaxImpl.cpp +++ b/unit_tests/Test_ArgMaxImpl.cpp @@ -9,9 +9,9 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -23,133 +23,104 @@ using namespace Aidge; TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("3D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> { - { - { - { 1.0, 2.0, 3.0, 4.0}, - { 8.0, 0.0, 17.0, 1.0}, - { 5.0, 10.0, 6.0, 0.0} - }, - { - { 7.0, 1.0, 9.0, 4.0}, - { 0.0, 8.0, 4.0, 2.0}, - { 9.0, 2.0, 0.0, 5.0} - } - } - }); - myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 2, 3, 4>{{{{1.0, 2.0, 3.0, 4.0}, + {8.0, 0.0, 17.0, 1.0}, + {5.0, 10.0, 6.0, 0.0}}, + {{7.0, 1.0, 9.0, 4.0}, + {0.0, 8.0, 4.0, 2.0}, + {9.0, 2.0, 0.0, 5.0}}}}); + myInput->setBackend("cuda"); SECTION("Axis 2") { - const Tensor myOutput = Tensor(Array3D<float,2,3, 1> { - { - { - {3.0}, - {2.0}, - {1.0} - }, - { - {2.0}, - {1.0}, - {0.0} - } - } - }); + const Tensor myOutput = Tensor(Array3D<float, 2, 3, 1>{ + {{{3.0}, {2.0}, {1.0}}, {{2.0}, {1.0}, {0.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(2); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); - } SECTION("Axis 2 with keep_dims false") { - const Tensor myOutput = Tensor(Array2D<float,2,3> { - { - { 3.0, 2.0, 1.0 }, - { 2.0, 1.0, 0.0 } - } - }); + const Tensor myOutput = Tensor( + Array2D<float, 2, 3>{{{3.0, 2.0, 1.0}, {2.0, 1.0, 0.0}}}); - std::shared_ptr<Node> myArgMax = ArgMax(2,0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + std::shared_ptr<Node> myArgMax = ArgMax(2, 0); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Axis 1") { - const Tensor myOutput = Tensor(Array3D<float,2,1,4> { - { - { 
- { 1.0, 2.0, 1.0, 0.0 } - }, - { - { 2.0, 1.0, 0.0, 2.0 } - } - } - }); + const Tensor myOutput = Tensor(Array3D<float, 2, 1, 4>{ + {{{1.0, 2.0, 1.0, 0.0}}, {{2.0, 1.0, 0.0, 2.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Axis 0") { - const Tensor myOutput = Tensor(Array3D<float,1,3,4> { - { - { - { 1.0, 0.0, 1.0, 0.0 }, - { 0.0, 1.0, 0.0, 1.0 }, - { 1.0, 0.0, 0.0, 1.0 } - } - } - }); + const Tensor myOutput = + Tensor(Array3D<float, 1, 3, 4>{{{{1.0, 0.0, 1.0, 0.0}, + {0.0, 1.0, 0.0, 1.0}, + {1.0, 0.0, 0.0, 1.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } SECTION("Select_Last_Index") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> { - { - 1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0 - } - }); - const Tensor myOutput = Tensor(Array1D<float,1> {{9}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array1D<float, 10>{ + {1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0}}); + const Tensor myOutput = Tensor(Array1D<float, 1>{{9}}); std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = + std::static_pointer_cast<OperatorTensor>(myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } \ No newline at end of file diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp index 3dccd6b7f909a9e9b4f8affb151898b77d94a7cf..965585244939792a2500d73b334c0dd2f421c934 100644 --- a/unit_tests/Test_AvgPoolingImpl.cpp +++ b/unit_tests/Test_AvgPoolingImpl.cpp @@ -11,8 +11,8 @@ #include <array> #include <cuda_fp16.h> // half type -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,116 +24,141 @@ using namespace Aidge; -TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") -{ - 
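
A note on the ArgMax sections above: the tests read the factory as ArgMax(axis, keep_dims, select_last_index), so ArgMax(2, 0) reduces axis 2 and drops it from the output shape, while ArgMax(0, 1, 1) keeps the reduced axis and, when the maximum occurs several times, reports the last position. For the 1-D input {1, 5, 9, 0, 6, 2, 9, 4, 3, 9} the maximum 9 appears at indices 2, 6 and 9, hence the expected output {9}.
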
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{// NCHW - { - {{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}}, - {{{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); - SECTION("Stride") - { - std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ - {{{{3, 5}, - {13, 15}}, - {{28, 30}, - {38, 40}}}, - {{{103, 105}, - {113, 115}}, - {{128, 130}, - {138, 140}}}}}); +TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 2, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}, + {{{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + SECTION("Stride") { + std::shared_ptr<Node> myAvgPool = + AvgPooling({2, 2}, "myAvgPool", {2, 2}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + {{{{3, 5}, {13, 15}}, {{28, 30}, {38, 40}}}, + {{{103, 105}, {113, 115}}, {{128, 130}, {138, 140}}}}}); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); float *computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Stride >= feature dim") - { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW - { - {{{0.3745, 0.9507, 0.7320}, - {0.5987, 0.1560, 0.1560}, - {0.0581, 0.8662, 0.6011}}}}}); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ - {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}}); + SECTION("Stride >= feature dim") { + 
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{// NCHW + {{{{0.3745, 0.9507, 0.7320}, + {0.5987, 0.1560, 0.1560}, + {0.0581, 0.8662, 0.6011}}}}}); + std::shared_ptr<Node> myAvgPool = + AvgPooling({3, 3}, "myAvgPool", {3, 3}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ + {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + + 0.0581 + 0.8662 + 0.6011) / + 9.0}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); float *computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("half") - { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{// NCHW - { - {{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)}, - {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)}, - {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}}); + SECTION("half") { + std::shared_ptr<Tensor> myInput2 = + std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{ + // NCHW + {{{{half_float::half(0.3745), + half_float::half(0.9507), + half_float::half(0.7320)}, + {half_float::half(0.5987), + half_float::half(0.1560), + half_float::half(0.1560)}, + {half_float::half(0.0581), + half_float::half(0.8662), + half_float::half(0.6011)}}}}}); myInput2->setBackend("cuda"); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ - {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}}); + std::shared_ptr<Node> myAvgPool = + AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ + {{{{(half_float::half(0.3745) + half_float::half(0.9507) + + half_float::half(0.7320) + half_float::half(0.5987) + + half_float::half(0.1560) + half_float::half(0.1560) + + half_float::half(0.0581) + half_float::half(0.8662) + + half_float::half(0.6011)) / + half_float::half(9.0)}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float16); op->setBackend("cuda"); myAvgPool->forward(); - half_float::half *computedOutput = new half_float::half[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * 
myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i); + half_float::half *computedOutput = + new half_float::half[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(half_float::half) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const half_float::half targetOutput = + *(static_cast<half_float::half *>( + myOutput->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -148,149 +173,150 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(kernel), + std::size_t(10)); // To measure execution time of 'AveragePooling_Op::forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, "myAvgPoolCuda", {stride, stride}); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAvgPoolCuda->getOperator()); + std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, + "myAvgPoolCuda", + {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myAvgPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCpu = AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAvgPoolCpu->getOperator()); + std::shared_ptr<Node> myAvgPoolCpu = + AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myAvgPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void 
**>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } -TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") -{ +TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") { // Run forward operation - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4> {// NCHW - { - { - { - {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16} - } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 1, 4, 4>{// NCHW + {{{{1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}, + {13, 14, 15, 16}}}}}); myInput->setBackend("cuda"); std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float, 1,1,2,2> { - { - { - { - {1, 2}, - {3, 4} - } - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 1, 2, 2>{{{{{1, 2}, {3, 4}}}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myAvgPool->backward()); - std::shared_ptr<Tensor> expectedInputGrad 
= std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4>{ - { - { - { - {0.25, 0.25, 0.5, 0.5}, - {0.25, 0.25, 0.5, 0.5}, - {0.75, 0.75, 1, 1}, - {0.75, 0.75, 1, 1} - } - } - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 1, 4, 4>{{{{{0.25, 0.25, 0.5, 0.5}, + {0.25, 0.25, 0.5, 0.5}, + {0.75, 0.75, 1, 1}, + {0.75, 0.75, 1, 1}}}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp index c83624020d86a2eb786d249c5ee664ca3bfdde3b..d1c8be720a5f76005d37e98604035c793b34103f 100644 --- a/unit_tests/Test_BatchNormImpl.cpp +++ b/unit_tests/Test_BatchNormImpl.cpp @@ -11,108 +11,116 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include "Test_cuda.hpp" #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/utils/TensorUtils.hpp" -#include "Test_cuda.hpp" using namespace Aidge; TEST_CASE("[gpu/operator] BatchNorm(forward)") { SECTION("Static Input") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + std::shared_ptr<Node> myBatchNorm = + BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>( + myBatchNorm->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{0.9159252643585205, 0.18772238492965698, 0.4479946792125702}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.33898890018463135, 0.3167555630207062, 0.7047033309936523}}); - std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.45547693967819214, 0.22650663554668427, 0.6612948179244995}}); - std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.02570258639752865, 0.026536229997873306, 0.15111008286476135}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW - { - { - {{0.12943482, 0.6451229 , 0.24979436}, - {0.7551012, 0.32007095, 0.89463896}, - {0.7087448, 0.6266124, 0.4782957 }}, - - {{0.13796203, 0.9950787, 0.71555305}, - {0.01347321, 0.4395316, 0.43097174}, - {0.6056306 , 0.9561122 , 0.5783939 }}, - - {{0.7174486 , 0.503465 , 0.23695093}, - {0.5145477, 0.39576462, 0.02779444}, - {0.60789394 ,0.14119725 ,0.20753163}} - }, - - - {{{0.74452287, 0.5354875 , 0.8148496 }, - {0.73356223, 0.4304034 , 0.11783765}, - {0.8966221, 0.41049036, 0.95982736}}, - - {{0.03161403, 0.71250844, 0.14337301}, - 
{0.5338889 , 0.13484782, 0.8055851 }, - {0.71784616 ,0.8349626 , 0.10107189}}, - - {{0.85701346, 0.58286697, 0.9836816 }, - {0.36061534, 0.03660944, 0.7375317 }, - {0.6977233, 0.51965624, 0.29440993}} - } - } - }); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { - { - { - {{-1.5233592, 1.4222438, -0.83586717}, - { 2.0504384, -0.43444824, 2.847476 }, - { 1.7856512, 1.3165123, 0.46932936}}, - - {{ 0.21473758 , 1.2022772, 0.8802177 }, - { 0.07130594 , 0.5621954, 0.55233306}, - { 0.7535689 , 1.1573814, 0.72218764}}, - - {{ 0.7694162 , 0.52281666, 0.2156798 }, - { 0.5355886 , 0.3987003, -0.02535689}, - { 0.6431629 , 0.10533108 , 0.18177633}}}, - - - {{{ 1.990015, 0.7960079, 2.3917203 }, - { 1.9274082, 0.19576907, -1.5896021 }, - { 2.8588037 , 0.08202624 , 3.2198315 }}, - - {{ 0.09220716, 0.8767097, 0.22097193}, - { 0.6709106 , 0.2111495, 0.9839494 }, - { 0.8828597 , 1.0177971 , 0.17223406}}, - - {{ 0.9302539 , 0.6143213 , 1.0762292 }, - { 0.35819346, -0.01519828, 0.79256046}, - { 0.7466844 , 0.5414758 , 0.28189686}} - } - } - }); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array1D<float, 3>{{0.9159252643585205, + 0.18772238492965698, + 0.4479946792125702}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 3>{{0.33898890018463135, + 0.3167555630207062, + 0.7047033309936523}}); + std::shared_ptr<Tensor> myMean = + std::make_shared<Tensor>(Array1D<float, 3>{{0.45547693967819214, + 0.22650663554668427, + 0.6612948179244995}}); + std::shared_ptr<Tensor> myVar = + std::make_shared<Tensor>(Array1D<float, 3>{{0.02570258639752865, + 0.026536229997873306, + 0.15111008286476135}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{ + // NCHW + {{{{0.12943482, 0.6451229, 0.24979436}, + {0.7551012, 0.32007095, 0.89463896}, + {0.7087448, 0.6266124, 0.4782957}}, + + {{0.13796203, 0.9950787, 0.71555305}, + {0.01347321, 0.4395316, 0.43097174}, + {0.6056306, 0.9561122, 0.5783939}}, + + {{0.7174486, 0.503465, 0.23695093}, + {0.5145477, 0.39576462, 0.02779444}, + {0.60789394, 0.14119725, 0.20753163}}}, + + {{{0.74452287, 0.5354875, 0.8148496}, + {0.73356223, 0.4304034, 0.11783765}, + {0.8966221, 0.41049036, 0.95982736}}, + + {{0.03161403, 0.71250844, 0.14337301}, + {0.5338889, 0.13484782, 0.8055851}, + {0.71784616, 0.8349626, 0.10107189}}, + + {{0.85701346, 0.58286697, 0.9836816}, + {0.36061534, 0.03660944, 0.7375317}, + {0.6977233, 0.51965624, 0.29440993}}}}}); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{ + {{{{-1.5233592, 1.4222438, -0.83586717}, + {2.0504384, -0.43444824, 2.847476}, + {1.7856512, 1.3165123, 0.46932936}}, + + {{0.21473758, 1.2022772, 0.8802177}, + {0.07130594, 0.5621954, 0.55233306}, + {0.7535689, 1.1573814, 0.72218764}}, + + {{0.7694162, 0.52281666, 0.2156798}, + {0.5355886, 0.3987003, -0.02535689}, + {0.6431629, 0.10533108, 0.18177633}}}, + + {{{1.990015, 0.7960079, 2.3917203}, + {1.9274082, 0.19576907, -1.5896021}, + {2.8588037, 0.08202624, 3.2198315}}, + + {{0.09220716, 0.8767097, 0.22097193}, + {0.6709106, 0.2111495, 0.9839494}, + {0.8828597, 1.0177971, 0.17223406}}, + + {{0.9302539, 0.6143213, 1.0762292}, + {0.35819346, -0.01519828, 0.79256046}, + {0.7466844, 0.5414758, 0.28189686}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); myMean->setBackend("cuda"); myVar->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - 
op->associateInput(2,myBias); - op->associateInput(3,myMean); - op->associateInput(4,myVar); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->associateInput(3, myMean); + op->associateInput(4, myVar); op->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-5); } @@ -127,44 +135,50 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t nbChannels = dims[1]; - // Create BatchNorm Operator Cuda - std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator()); + std::shared_ptr<Node> myBatchNormCuda = + BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myBatchNormCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create BatchNorm Operator CPU - std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator()); + std::shared_ptr<Node> myBatchNormCpu = + BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myBatchNormCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - float* array0 = new float[nb_elements]; - float* weights = new float[nbChannels]; - float* bias = new float[nbChannels]; - float* mean = new float[nbChannels]; - float* var = new float[nbChannels]; - + float *array0 = new 
float[nb_elements]; + float *weights = new float[nbChannels]; + float *bias = new float[nbChannels]; + float *mean = new float[nbChannels]; + float *var = new float[nbChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -177,23 +191,27 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d, *mean_d, *var_d; + float *array0_d, *weight_d, *bias_d, *mean_d, *var_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -201,17 +219,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize({nbChannels}); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * nbChannels); - cudaMemcpy(weight_d, weights, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * nbChannels); + cudaMemcpy(weight_d, + weights, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, nbChannels); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize({nbChannels}); - Tw_cpu -> getImpl() -> setRawPtr(weights, nbChannels); + Tw_cpu->getImpl()->setRawPtr(weights, nbChannels); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -219,17 +241,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({nbChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * nbChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({nbChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + Tb_cpu->getImpl()->setRawPtr(bias, nbChannels); // mean CUDA std::shared_ptr<Tensor> Tm_cuda = std::make_shared<Tensor>(); @@ -237,17 +263,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tm_cuda->setBackend("cuda"); Tm_cuda->resize({nbChannels}); 
op_cuda->associateInput(3, Tm_cuda); - cudaMalloc(reinterpret_cast<void **>(&mean_d), sizeof(float) * nbChannels); - cudaMemcpy(mean_d, mean, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&mean_d), + sizeof(float) * nbChannels); + cudaMemcpy(mean_d, + mean, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tm_cuda->getImpl()->setRawPtr(mean_d, nbChannels); // mean CPU std::shared_ptr<Tensor> Tm_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(3,Tm_cpu); + op_cpu->associateInput(3, Tm_cpu); Tm_cpu->setDataType(DataType::Float32); Tm_cpu->setBackend("cpu"); Tm_cpu->resize({nbChannels}); - Tm_cpu -> getImpl() -> setRawPtr(mean, nbChannels); + Tm_cpu->getImpl()->setRawPtr(mean, nbChannels); // var CUDA std::shared_ptr<Tensor> Tv_cuda = std::make_shared<Tensor>(); @@ -255,31 +285,40 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tv_cuda->setBackend("cuda"); Tv_cuda->resize({nbChannels}); op_cuda->associateInput(4, Tv_cuda); - cudaMalloc(reinterpret_cast<void **>(&var_d), sizeof(float) * nbChannels); - cudaMemcpy(var_d, var, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&var_d), + sizeof(float) * nbChannels); + cudaMemcpy(var_d, + var, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tv_cuda->getImpl()->setRawPtr(var_d, nbChannels); // var CPU std::shared_ptr<Tensor> Tv_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(4,Tv_cpu); + op_cpu->associateInput(4, Tv_cpu); Tv_cpu->setDataType(DataType::Float32); Tv_cpu->setBackend("cpu"); Tv_cpu->resize({nbChannels}); - Tv_cpu -> getImpl() -> setRawPtr(var, nbChannels); + Tv_cpu->getImpl()->setRawPtr(var, nbChannels); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -295,52 +334,34 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { cudaFree(var_d); } std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } TEST_CASE("[gpu/operator] BatchNorm(backward)") { SECTION("Static Input") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + std::shared_ptr<Node> myBatchNorm = + BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>( + myBatchNorm->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); // Forward - std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{-1.58390772, -0.48463920, 1.30413496}}); - std::shared_ptr<Tensor> myBias = 
std::make_shared<Tensor>(Array1D<float,3> {{0.06150287, -0.03140282, -0.49673468}}); - std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.68328333, -0.47286209, 1.11688483}}); - std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.84838068, 1.05930495, 0.53670371}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { //NCHW - { - { - { - {1.46650600, 1.24083233}, - {-0.33106008, -0.15137172} - }, - { - { 0.06625678, -1.83266091}, - { 0.53444749, -0.05167147} - }, - { - { 0.41069385, -0.70850474}, - { 0.23363227, 0.06111236} - } - }, - { - { - { 0.16707586, 1.07217050}, - { 1.18544745, 0.03441877} - }, - { - { 0.88106865, 0.33312374}, - { 0.87147945, 1.46628737} - }, - { - { 0.23930393, -0.94172227}, - { 1.48735642, 0.46449399} - } - } - } - }); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array1D<float, 3>{{-1.58390772, -0.48463920, 1.30413496}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>( + Array1D<float, 3>{{0.06150287, -0.03140282, -0.49673468}}); + std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>( + Array1D<float, 3>{{0.68328333, -0.47286209, 1.11688483}}); + std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>( + Array1D<float, 3>{{0.84838068, 1.05930495, 0.53670371}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + // NCHW + {{{{1.46650600, 1.24083233}, {-0.33106008, -0.15137172}}, + {{0.06625678, -1.83266091}, {0.53444749, -0.05167147}}, + {{0.41069385, -0.70850474}, {0.23363227, 0.06111236}}}, + {{{0.16707586, 1.07217050}, {1.18544745, 0.03441877}}, + {{0.88106865, 0.33312374}, {0.87147945, 1.46628737}}, + {{0.23930393, -0.94172227}, {1.48735642, 0.46449399}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); @@ -348,47 +369,24 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { myMean->setBackend("cuda"); myVar->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - op->associateInput(2,myBias); - op->associateInput(3,myMean); - op->associateInput(4,myVar); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->associateInput(3, myMean); + op->associateInput(4, myVar); op->forward(); // Backward - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { - { - { - { - { 1.34347093, 0.90813798}, - { 0.39607167, 1.20428133} - }, - { - { 0.16845724, 0.48487359}, - { 0.40748054, -0.21790814} - }, - { - {-1.83932650, -0.42746788}, - { 0.97129798, 2.04073548} - } - }, - { - { - {-0.95714629, 0.18446854}, - { 1.14551663, -1.38118088} - }, - { - {-0.44466951, 2.73914146}, - { 0.57898718, 2.23699141} - }, - { - { 0.25004527, -0.18481003}, - {-0.72439206, 0.87744337} - } - - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + {{{{1.34347093, 0.90813798}, {0.39607167, 1.20428133}}, + {{0.16845724, 0.48487359}, {0.40748054, -0.21790814}}, + {{-1.83932650, -0.42746788}, {0.97129798, 2.04073548}}}, + {{{-0.95714629, 0.18446854}, {1.14551663, -1.38118088}}, + {{-0.44466951, 2.73914146}, {0.57898718, 2.23699141}}, + {{0.25004527, -0.18481003}, {-0.72439206, 0.87744337}} + + }}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -398,38 +396,30 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myBatchNorm->backward()); - 
std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ - { - { - { - {-0.92418045, -0.26092845}, - {-1.53920066, -3.14756274}}, + std::shared_ptr<Tensor> expectedInputGrad = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + {{{{-0.92418045, -0.26092845}, {-1.53920066, -3.14756274}}, - {{ 0.26948565, -0.18548687}, - { 0.21506749, 0.45458069}}, + {{0.26948565, -0.18548687}, {0.21506749, 0.45458069}}, - {{-3.57358932, -1.30609703}, - { 1.61337423, 3.55250096}}}, + {{-3.57358932, -1.30609703}, {1.61337423, 3.55250096}}}, + {{{2.41264391, 1.16695499}, {-0.90373814, 3.19601130}}, - {{{ 2.41264391, 1.16695499}, - {-0.90373814, 3.19601130}}, + {{0.71554798, -1.04076481}, {0.17618656, -0.60461664}}, - {{ 0.71554798, -1.04076481}, - { 0.17618656, -0.60461664}}, - - {{ 0.26926503, -0.92978811}, - {-1.13964832, 1.51398242} - } - } - } - }); + {{0.26926503, -0.92978811}, {-1.13964832, 1.51398242}}}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_CastMove.cpp b/unit_tests/Test_CastMove.cpp index c96600f79967c69e43b3c334d3624f6514b6f936..ef5a334402f9ebc95d4121353261e67a888b7e3f 100644 --- a/unit_tests/Test_CastMove.cpp +++ b/unit_tests/Test_CastMove.cpp @@ -14,63 +14,66 @@ #include <string> #include "aidge/data/Tensor.hpp" -#include "aidge/utils/TensorUtils.hpp" -#include "aidge/graph/Node.hpp" #include "aidge/graph/GraphView.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/graph/OpArgs.hpp" -#include "aidge/scheduler/SequentialScheduler.hpp" #include "aidge/recipes/Recipes.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/utils/TensorUtils.hpp" #include "aidge/backend/cuda.hpp" using namespace Aidge; TEST_CASE("[cuda/castmove] CastMove(forward)") { - std::shared_ptr<Tensor> inputTensor = - std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}}, - {{{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}}}}); - - std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>( - Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}}, - {{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}}, - {{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}}); - - std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + std::shared_ptr<Tensor> inputTensor = std::make_shared<Tensor>( + Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}}, + {{{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}}}); + + std::shared_ptr<Tensor> weight1 = + std::make_shared<Tensor>(Array4D<int, 3, 1, 3, 3>{ + {{{{1, 2, 3}, {4, 5, 6}, {7, 8, 
9}}}, + {{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}}, + {{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}}); + + std::shared_ptr<Tensor> bias1 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); SECTION("Test implicit") { std::shared_ptr<GraphView> g = - Sequential({ - Conv(1, 3, {3, 3}, "conv1"), - Conv(3, 4, {1, 1}, "conv2"), - Conv(4, 3, {1, 1}, "conv3")}); + Sequential({Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); g->getNode("conv1")->getOperator()->setInput(2, bias1); - std::shared_ptr<Tensor> weight2 = - std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, - {{{4}}, {{5}}, {{6}}}, - {{{7}}, {{8}}, {{9}}}, - {{{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + std::shared_ptr<Tensor> weight2 = std::make_shared<Tensor>( + Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = + std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); g->getNode("conv2")->getOperator()->setInput(1, weight2); g->getNode("conv2")->getOperator()->setInput(2, bias2); // *(g->getNode("conv2")->getOperator()->input(1, weight2); std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( - Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, - {{{5}}, {{6}}, {{7}}, {{8}}}, - {{{9}}, {{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); g->getNode("conv3")->getOperator()->setInput(1, weight3); g->getNode("conv3")->getOperator()->setInput(2, bias3); @@ -85,77 +88,121 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { REQUIRE_NOTHROW(scheduler.forward()); scheduler.saveSchedulingDiagram("schedulingSequential"); - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, - {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, - {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, - {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, - {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, - {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); - - std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ - {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, - {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, - {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, - {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, - {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, - {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, - {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, - {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); - - std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, - {{496804, 570568, 
644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, - {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, - {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, - {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, - {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared< + Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>( + Array4D<int, 2, 4, 3, 3>{{{{{6099, 7017, 7935}, + {10689, 11607, 12525}, + {15279, 16197, 17115}}, + {{13786, 15838, 17890}, + {24046, 26098, 28150}, + {34306, 36358, 38410}}, + {{21473, 24659, 27845}, + {37403, 40589, 43775}, + {53333, 56519, 59705}}, + {{29160, 33480, 37800}, + {50760, 55080, 59400}, + {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, + {33639, 34557, 35475}, + {38229, 39147, 40065}}, + {{65086, 67138, 69190}, + {75346, 77398, 79450}, + {85606, 87658, 89710}}, + {{101123, 104309, 107495}, + {117053, 120239, 123425}, + {132983, 136169, 139355}}, + {{137160, 141480, 145800}, + {158760, 163080, 167400}, + {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>( + Array4D<int, 2, 3, 3, 3>{{{{{214731, 246591, 278451}, + {374031, 405891, 437751}, + {533331, 565191, 597051}}, + {{496804, 570568, 644332}, + {865624, 939388, 1013152}, + {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, + {1357217, 1472885, 1588553}, + {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, + {1170531, 1202391, 1234251}, + {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, + {2709724, 2783488, 2857252}, + {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, + {4248917, 4364585, 4480253}, + {4827257, 4942925, 5058593}}}}}); + + std::shared_ptr<Tensor> other1 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv1")->getOperator()) + ->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); - REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<half_float::half, int>(hostOther1, + *expectedOutput1, + 0.001, + 0.0)); + + std::shared_ptr<Tensor> other2 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv2")->getOperator()) + ->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); - REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); + REQUIRE( + approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = + 
std::static_pointer_cast<OperatorTensor>( + g->getNode("conv3")->getOperator()) + ->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); - REQUIRE(approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); + REQUIRE( + approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); } SECTION("Test explicit") { std::shared_ptr<GraphView> g = - Sequential({ - Conv(1, 3, {3, 3}, "conv1"), - Conv(3, 4, {1, 1}, "conv2"), - Conv(4, 3, {1, 1}, "conv3")}); + Sequential({Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); g->getNode("conv1")->getOperator()->setInput(2, bias1); - std::shared_ptr<Tensor> weight2 = - std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, - {{{4}}, {{5}}, {{6}}}, - {{{7}}, {{8}}, {{9}}}, - {{{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + std::shared_ptr<Tensor> weight2 = std::make_shared<Tensor>( + Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = + std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); g->getNode("conv2")->getOperator()->setInput(1, weight2); g->getNode("conv2")->getOperator()->setInput(2, bias2); // *(g->getNode("conv2")->getOperator()->input(1, weight2); std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( - Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, - {{{5}}, {{6}}, {{7}}, {{8}}}, - {{{9}}, {{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); g->getNode("conv3")->getOperator()->setInput(1, weight3); g->getNode("conv3")->getOperator()->setInput(2, bias3); @@ -172,48 +219,91 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { REQUIRE_NOTHROW(scheduler.forward()); scheduler.saveSchedulingDiagram("schedulingSequential"); - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, - {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, - {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, - {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, - {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, - {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); - - std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ - {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, - {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, - {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, - {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, - {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, - {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, - {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, - {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); - - std::shared_ptr<Tensor> expectedOutput3 = 
std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, - {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, - {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, - {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, - {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, - {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared< + Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>( + Array4D<int, 2, 4, 3, 3>{{{{{6099, 7017, 7935}, + {10689, 11607, 12525}, + {15279, 16197, 17115}}, + {{13786, 15838, 17890}, + {24046, 26098, 28150}, + {34306, 36358, 38410}}, + {{21473, 24659, 27845}, + {37403, 40589, 43775}, + {53333, 56519, 59705}}, + {{29160, 33480, 37800}, + {50760, 55080, 59400}, + {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, + {33639, 34557, 35475}, + {38229, 39147, 40065}}, + {{65086, 67138, 69190}, + {75346, 77398, 79450}, + {85606, 87658, 89710}}, + {{101123, 104309, 107495}, + {117053, 120239, 123425}, + {132983, 136169, 139355}}, + {{137160, 141480, 145800}, + {158760, 163080, 167400}, + {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>( + Array4D<int, 2, 3, 3, 3>{{{{{214731, 246591, 278451}, + {374031, 405891, 437751}, + {533331, 565191, 597051}}, + {{496804, 570568, 644332}, + {865624, 939388, 1013152}, + {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, + {1357217, 1472885, 1588553}, + {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, + {1170531, 1202391, 1234251}, + {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, + {2709724, 2783488, 2857252}, + {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, + {4248917, 4364585, 4480253}, + {4827257, 4942925, 5058593}}}}}); + + std::shared_ptr<Tensor> other1 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv1")->getOperator()) + ->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); - REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<half_float::half, int>(hostOther1, + *expectedOutput1, + 0.001, + 0.0)); + + std::shared_ptr<Tensor> other2 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv2")->getOperator()) + ->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); - REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); + REQUIRE( + approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - 
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv3")->getOperator()) + ->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); - REQUIRE(approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); + REQUIRE( + approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); } } diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp index 4655de069cce86e80881a06673621c8159be18f6..130c634b0c0475ae371f8e135fbb66fdc641148a 100644 --- a/unit_tests/Test_ConvDepthWiseImpl.cpp +++ b/unit_tests/Test_ConvDepthWiseImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,143 +24,117 @@ using namespace Aidge; TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { SECTION("Deterministic Input") { - std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); - auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> { - { - {{ - { 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8} - - }}, - {{ - { 27, 28, 29}, - { 30, 31, 32}, - { 33, 34, 35} - - }}, - {{ - { 54, 55, 56}, - { 57, 58, 59}, - { 60, 61, 62} - }}, - {{ - { 81, 82, 83}, - { 84, 85, 86}, - { 87, 88, 89} - }} - } - }); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}}, - - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}} - }, - { - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}, - - {{150, 151, 152, 153, 154}, - {155, 156, 157, 158, 159}, - {160, 161, 162, 163, 164}, - {165, 166, 167, 168, 169}, - {170, 171, 172, 173, 174}}, - - {{175, 176, 177, 178, 179}, - {180, 181, 182, 183, 184}, - {185, 186, 187, 188, 189}, - {190, 191, 192, 193, 194}, - {195, 196, 197, 198, 199}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { - { - { - {{ 319, 355, 391}, - { 499, 535, 571}, - { 679, 715, 751}}, - - {{ 8745, 9024, 9303}, - { 10140, 10419, 10698}, - { 11535, 11814, 12093}}, - - {{ 29337, 29859, 30381}, - { 31947, 32469, 32991}, - { 34557, 35079, 35601}}, - - {{ 62061, 62826, 63591}, - { 65886, 66651, 67416}, - { 69711, 70476, 71241}} - }, - { - {{ 3919, 3955, 3991}, - { 4099, 4135, 4171}, - { 4279, 4315, 4351}}, - - {{ 36645, 36924, 37203}, - { 38040, 
38319, 38598}, - { 39435, 39714, 39993}}, - - {{ 81537, 82059, 82581}, - { 84147, 84669, 85191}, - { 86757, 87279, 87801}}, - - {{138561, 139326, 140091}, - {142386, 143151, 143916}, - {146211, 146976, 147741}} - } - } - }); + std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3, 3}, "mycdw"); + auto op = + std::static_pointer_cast<OperatorTensor>(myCDW->getOperator()); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array4D<float, 4, 1, 3, 3>{ + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8} + + }}, + {{{27, 28, 29}, {30, 31, 32}, {33, 34, 35} + + }}, + {{{54, 55, 56}, {57, 58, 59}, {60, 61, 62}}}, + {{{81, 82, 83}, {84, 85, 86}, {87, 88, 89}}}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 4>{{7, 0, 9, 0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 4, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}, + + {{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}}, + {{{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}, + + {{150, 151, 152, 153, 154}, + {155, 156, 157, 158, 159}, + {160, 161, 162, 163, 164}, + {165, 166, 167, 168, 169}, + {170, 171, 172, 173, 174}}, + + {{175, 176, 177, 178, 179}, + {180, 181, 182, 183, 184}, + {185, 186, 187, 188, 189}, + {190, 191, 192, 193, 194}, + {195, 196, 197, 198, 199}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 4, 3, 3>{ + {{{{319, 355, 391}, {499, 535, 571}, {679, 715, 751}}, + + {{8745, 9024, 9303}, + {10140, 10419, 10698}, + {11535, 11814, 12093}}, + + {{29337, 29859, 30381}, + {31947, 32469, 32991}, + {34557, 35079, 35601}}, + + {{62061, 62826, 63591}, + {65886, 66651, 67416}, + {69711, 70476, 71241}}}, + {{{3919, 3955, 3991}, {4099, 4135, 4171}, {4279, 4315, 4351}}, + + {{36645, 36924, 37203}, + {38040, 38319, 38598}, + {39435, 39714, 39993}}, + + {{81537, 82059, 82581}, + {84147, 84669, 85191}, + {86757, 87279, 87801}}, + + {{138561, 139326, 140091}, + {142386, 143151, 143916}, + {146211, 146976, 147741}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myCDW -> forward(); + myCDW->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for 
(int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -173,50 +147,69 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> kernelDist(1, + std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t kernel = kernelDist(gen); - std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2), - std::size_t(10)); + std::uniform_int_distribution<std::size_t> resolutionDist( + std::size_t(kernel + 2), + std::size_t(10)); const std::size_t nbDims = 4; // input (batch, ch, Xin, Yin) // weight (outCh, ch, kernelX, kernelY) std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { - if(i < 2) + if (i < 2) dims.push_back(dimSizeDist(gen)); else dims.push_back(resolutionDist(gen)); } - dims[1] = 1; // TODO FIX: ConvDepthWise doesn't give the same output in CUDA as in CPU unless channels is 1 + dims[1] = 1; // TODO FIX: ConvDepthWise doesn't give the same + // output in CUDA as in CPU unless channels is 1 const std::size_t nbChannels = dims[1]; - const std::vector<std::size_t> dimsW{nbChannels,nbChannels,kernel,kernel}; - - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::vector<std::size_t> dimsW{nbChannels, + nbChannels, + kernel, + kernel}; + + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t wieghtSize = + std::accumulate(dimsW.cbegin(), + dimsW.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); // Create ConvDepthWise Operator CUDA - std::shared_ptr<Node> myConvCUDA = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + std::shared_ptr<Node> myConvCUDA = + ConvDepthWise(nbChannels, {kernel, kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myConvCUDA->getOperator()); // Create ConvDepthWise Operator CPU - std::shared_ptr<Node> myConvCPU = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + std::shared_ptr<Node> myConvCPU = + ConvDepthWise(nbChannels, {kernel, kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myConvCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new 
float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[nbChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[nbChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -229,23 +222,27 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -253,17 +250,21 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -271,17 +272,21 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({nbChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * nbChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({nbChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + Tb_cpu->getImpl()->setRawPtr(bias, nbChannels); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -289,15 +294,20 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", 
"[ConvDepthWise][CPU]") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp index 72a4040a8ecbd091e24f8441d9c29970ea82c606..5c5d9dafd543fe22f437fb6f382857fcbca9ce06 100644 --- a/unit_tests/Test_ConvImpl.cpp +++ b/unit_tests/Test_ConvImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,43 +24,38 @@ using namespace Aidge; TEST_CASE("[gpu/operator] Conv(forward)") { SECTION("Simple Conv no bias") { - std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv"); - auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); + std::shared_ptr<Node> myConv = Conv(1, 1, {3, 3}, "myconv"); + auto op = + std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}} - } - } - }); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}} - } - } - }); - const float myOutput = 0*0+1*1+2*2+3*3+4*4+5*5+6*6+7*7+8*8; + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{{{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{// NCHW + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}}}); + const float myOutput = 0 * 0 + 1 * 1 + 2 * 2 + 3 * 3 + 4 * 4 + 5 * 5 + + 6 * 6 + 7 * 7 + 8 * 8; myInput->setBackend("cuda"); myWeights->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); myConv->forward(); REQUIRE(op->getOutput(0)->size() == 1); std::array<float, 9> kernel; - cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&kernel[0], + myWeights->getImpl()->rawPtr(), + 9 * sizeof(float), + cudaMemcpyDeviceToHost); std::array<float, 9> input; - cudaMemcpy(&input[0], myInput->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&input[0], + myInput->getImpl()->rawPtr(), + 9 * sizeof(float), + cudaMemcpyDeviceToHost); for (int i = 0; i < 9; ++i) { REQUIRE(kernel[i] == i); @@ -68,155 +63,117 @@ TEST_CASE("[gpu/operator] 
Conv(forward)") { } float computedOutput; - cudaMemcpy(&computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float), + cudaMemcpyDeviceToHost); REQUIRE(fabs(computedOutput - myOutput) < 1e-6); } SECTION("Classic Conv") { - std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv"); - auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); + std::shared_ptr<Node> myConv = Conv(3, 4, {3, 3}, "myconv"); + auto op = + std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> { - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}}, - {{ 9, 10, 11}, - { 12, 13, 14}, - { 15, 16, 17}}, - {{ 18, 19, 20}, - { 21, 22, 23}, - { 24, 25, 26}} - }, - { - {{ 27, 28, 29}, - { 30, 31, 32}, - { 33, 34, 35}}, - {{ 36, 37, 38}, - { 39, 40, 41}, - { 42, 43, 44}}, - {{ 45, 46, 47}, - { 48, 49, 50}, - { 51, 52, 53}} - }, - { - {{ 54, 55, 56}, - { 57, 58, 59}, - { 60, 61, 62}}, - {{ 63, 64, 65}, - { 66, 67, 68}, - { 69, 70, 71}}, - {{ 72, 73, 74}, - { 75, 76, 77}, - { 78, 79, 80}} - }, - { - {{ 81, 82, 83}, - { 84, 85, 86}, - { 87, 88, 89}}, - {{ 90, 91, 92}, - { 93, 94, 95}, - { 96, 97, 98}}, - {{ 99, 100, 101}, - {102, 103, 104}, - {105, 106, 107}} - } - } - }); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { - { - { - {{ 15226, 15577, 15928}, - { 16981, 17332, 17683}, - { 18736, 19087, 19438}}, - {{ 37818, 38898, 39978}, - { 43218, 44298, 45378}, - { 48618, 49698, 50778}}, - {{ 60426, 62235, 64044}, - { 69471, 71280, 73089}, - { 78516, 80325, 82134}}, - {{ 83016, 85554, 88092}, - { 95706, 98244, 100782}, - {108396, 110934, 113472}} - }, - { - {{ 41551, 41902, 42253}, - { 43306, 43657, 44008}, - { 45061, 45412, 45763}}, - {{118818, 119898, 120978}, - {124218, 125298, 126378}, - {129618, 130698, 131778}}, - {{196101, 197910, 199719}, - {205146, 206955, 208764}, - {214191, 216000, 217809}}, - {{273366, 275904, 278442}, - {286056, 288594, 291132}, - {298746, 301284, 303822}} - } - } - }); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array4D<float, 4, 3, 3, 3>{ + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}, + {{9, 10, 11}, {12, 13, 14}, {15, 16, 17}}, + {{18, 19, 20}, {21, 22, 23}, {24, 25, 26}}}, + {{{27, 28, 29}, {30, 31, 32}, {33, 34, 35}}, + {{36, 37, 38}, {39, 40, 41}, {42, 43, 44}}, 
+ {{45, 46, 47}, {48, 49, 50}, {51, 52, 53}}}, + {{{54, 55, 56}, {57, 58, 59}, {60, 61, 62}}, + {{63, 64, 65}, {66, 67, 68}, {69, 70, 71}}, + {{72, 73, 74}, {75, 76, 77}, {78, 79, 80}}}, + {{{81, 82, 83}, {84, 85, 86}, {87, 88, 89}}, + {{90, 91, 92}, {93, 94, 95}, {96, 97, 98}}, + {{99, 100, 101}, {102, 103, 104}, {105, 106, 107}}}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 4>{{7, 0, 9, 0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 4, 3, 3>{{{{{15226, 15577, 15928}, + {16981, 17332, 17683}, + {18736, 19087, 19438}}, + {{37818, 38898, 39978}, + {43218, 44298, 45378}, + {48618, 49698, 50778}}, + {{60426, 62235, 64044}, + {69471, 71280, 73089}, + {78516, 80325, 82134}}, + {{83016, 85554, 88092}, + {95706, 98244, 100782}, + {108396, 110934, 113472}}}, + {{{41551, 41902, 42253}, + {43306, 43657, 44008}, + {45061, 45412, 45763}}, + {{118818, 119898, 120978}, + {124218, 125298, 126378}, + {129618, 130698, 131778}}, + {{196101, 197910, 199719}, + {205146, 206955, 208764}, + {214191, 216000, 217809}}, + {{273366, 275904, 278442}, + {286056, 288594, 291132}, + {298746, 301284, 303822}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - op->associateInput(2,myBias); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); myConv->forward(); // op->getOutput(0)->print(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -229,48 +186,66 @@ TEST_CASE("[gpu/operator] Conv(forward)") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + 
std::uniform_int_distribution<std::size_t> kernelDist(1, + std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t kernel = kernelDist(gen); - std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel), - std::size_t(10)); + std::uniform_int_distribution<std::size_t> resolutionDist( + std::size_t(kernel), + std::size_t(10)); const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { - if(i < 2) + if (i < 2) dims.push_back(dimSizeDist(gen)); else dims.push_back(resolutionDist(gen)); } const std::size_t outChannels = dimSizeDist(gen); - const std::vector<std::size_t> dimsW{outChannels,dims[1],kernel,kernel}; + const std::vector<std::size_t> dimsW{outChannels, + dims[1], + kernel, + kernel}; const std::size_t inChannels = dims[1]; - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t wieghtSize = + std::accumulate(dimsW.cbegin(), + dimsW.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); // Create Conv Operator CUDA - std::shared_ptr<Node> myConvCUDA = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + std::shared_ptr<Node> myConvCUDA = + Conv(inChannels, outChannels, {kernel, kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myConvCUDA->getOperator()); // Create Conv Operator CPU - std::shared_ptr<Node> myConvCPU = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + std::shared_ptr<Node> myConvCPU = + Conv(inChannels, outChannels, {kernel, kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myConvCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[outChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[outChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -283,23 +258,27 @@ TEST_CASE("[gpu/operator] Conv(forward)") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, 
+ sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -307,17 +286,21 @@ TEST_CASE("[gpu/operator] Conv(forward)") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -325,17 +308,21 @@ TEST_CASE("[gpu/operator] Conv(forward)") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({outChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * outChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * outChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({outChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + Tb_cpu->getImpl()->setRawPtr(bias, outChannels); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -343,16 +330,22 @@ TEST_CASE("[gpu/operator] Conv(forward)") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); 
delete[] array0; @@ -365,5 +358,4 @@ TEST_CASE("[gpu/operator] Conv(forward)") { } std::cout << "total time: " << duration.count() << "μs" << std::endl; } - } diff --git a/unit_tests/Test_DivImpl.cpp b/unit_tests/Test_DivImpl.cpp index 07cde5d6acb8eeeff2667e5c67aedb87b893e84c..1a7d2719f0c5ddbe85bb6564f6b553c6fd716c8b 100644 --- a/unit_tests/Test_DivImpl.cpp +++ b/unit_tests/Test_DivImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,118 +23,145 @@ namespace Aidge { TEST_CASE("[gpu/operator] Div", "[Div][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); - - // To measure execution time of 'forward()' - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Div Operator CUDA - std::shared_ptr<Node> myDivCUDA = Div(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myDivCUDA -> getOperator()); - - // Create Div Operator CPU - std::shared_ptr<Node> myDivCPU = Div(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myDivCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims0, dims1, dims; - for (std::size_t i = 0; i < nbDims; ++i) { - const std::size_t dim = dimSizeDist(gen); - dims0.push_back(dim); - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } - dims.push_back(std::max(dims0[i], dims1[i])); - } - - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; - - for (std::size_t i = 0; i < nb_elements0; ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < nb_elements1; ++i) { - array1[i] = valueDist(gen); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); + + // To measure execution time of 'forward()' + 
std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Div Operator CUDA + std::shared_ptr<Node> myDivCUDA = Div(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myDivCUDA->getOperator()); + + // Create Div Operator CPU + std::shared_ptr<Node> myDivCPU = Div(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myDivCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + } else { + dims1.push_back(dim); } + dims.push_back(std::max(dims0[i], dims1[i])); + } - // input0 CUDA - float* array0_d, *array1_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims0); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); - - // input1 CUDA - std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); - T1_cuda->setDataType(DataType::Float32); - T1_cuda->setBackend("cuda"); - T1_cuda->resize(dims1); - op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); - T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); - - // input1 CPU - std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); - T1_cpu->setDataType(DataType::Float32); - T1_cpu->setBackend("cpu"); - T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - delete[] array1; - cudaFree(array0_d); - cudaFree(array1_d); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + 
std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); } + + // input0 CUDA + float *array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp index 472fd273b1b5eff49e0d05ebd499afdb1435770c..b95151d26866a689af3aa525350953472e87453c 100644 --- a/unit_tests/Test_FCImpl.cpp +++ b/unit_tests/Test_FCImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,106 +24,128 @@ using namespace Aidge; TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { SECTION("Static Input") { - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{ - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{ - {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array2D<float, 5, 75>{ + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 2, 5>{{{23601, 23602, 23603, 23604, 23605}, + {68601, 68602, 68603, 68604, 68605}}}); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); + auto op = + std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{ - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, - {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 2, 75>{ + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, + {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, + 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, + 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); myInput->setBackend("cuda"); op->associateInput(0, myInput); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("4D input") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}, - {{50, 51, 52, 53, 54}, - {55, 56, 57, 58, 59}, - {60, 61, 62, 63, 64}, - {65, 66, 67, 68, 69}, - {70, 71, 72, 73, 74}}}, - {{{75, 76, 77, 78, 79}, - {80, 81, 82, 83, 84}, - {85, 86, 87, 88, 89}, - {90, 91, 92, 93, 94}, - {95, 96, 97, 98, 99}}, - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + {{50, 
51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); myInput->setBackend("cuda"); op->associateInput(0, myInput); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -131,21 +153,23 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } } - SECTION("Random Input"){ + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { @@ -153,28 +177,35 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } const std::size_t outChannels = dimSizeDist(gen); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t inChannels = nb_elements / dims[0]; const std::vector<std::size_t> dimsW{outChannels, inChannels}; const std::size_t wieghtSize = outChannels * inChannels; // Create FC Operator CUDA - std::shared_ptr<Node> myFCCUDA = FC(inChannels, outChannels, false, "myfccuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myFCCUDA -> getOperator()); + std::shared_ptr<Node> myFCCUDA = + FC(inChannels, outChannels, false, "myfccuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myFCCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create FC Operator CPU - std::shared_ptr<Node> myFCCPU = 
FC(inChannels, outChannels, false, "myfccpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myFCCPU -> getOperator()); + std::shared_ptr<Node> myFCCPU = + FC(inChannels, outChannels, false, "myfccpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myFCCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[outChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[outChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -187,23 +218,27 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -211,17 +246,21 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -229,31 +268,40 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({outChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * outChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * outChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, 
Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({outChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + Tb_cpu->getImpl()->setRawPtr(bias, outChannels); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -270,80 +318,78 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { TEST_CASE("[gpu/operator] FC(backward)", "[FC][GPU]") { SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - { - {0.1, 0.2, 0.3}, - {0.4, 0.5, 0.6} - }}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{0.1, 0.2, 0.3}, - {0.4, 0.5, 0.6}}}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}}); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); op->associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 2, 2> { - { - {0.1, 0.2}, - {0.3, 0.4} - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 2>{{{0.1, 0.2}, {0.3, 0.4}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myFC->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {0.09, 0.12, 0.15}, - {0.19, 0.26, 0.33} - } - }); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float,2> { - {0.4, 0.6} - }); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {0.13, 0.17, 0.21}, - 
{0.18, 0.24, 0.3 } - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.09, 0.12, 0.15}, {0.19, 0.26, 0.33}}}); + std::shared_ptr<Tensor> expectedBiasGrad = + std::make_shared<Tensor>(Array1D<float, 2>{{0.4, 0.6}}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.13, 0.17, 0.21}, {0.18, 0.24, 0.3}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); float *computedGradWCuda = new float[expectedWeightsGrad->size()](); - cudaMemcpy(computedGradWCuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedWeightsGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradWCuda, + op->getInput(1)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedWeightsGrad->size(), + cudaMemcpyDeviceToHost); float *computedGradBCuda = new float[expectedBiasGrad->size()](); - cudaMemcpy(computedGradBCuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedBiasGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradBCuda, + op->getInput(2)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedBiasGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } - for(int i = 0; i < expectedBiasGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedBiasGrad->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedBiasGrad->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedBiasGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradBCuda[i] - targetOutput) < 1e-6); } - for(int i = 0; i < expectedWeightsGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedWeightsGrad->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedWeightsGrad->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedWeightsGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradWCuda[i] - targetOutput) < 1e-6); } - - - delete[] computedGradCuda; delete[] computedGradWCuda; delete[] computedGradBCuda; diff --git a/unit_tests/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/Test_GlobalAveragePoolingImpl.cpp index 0a0f22ab60ced3a3f7648ce798484f72bd67839a..b6eac105e3ea700e13282f111a25a4a7b7a3e0bc 100644 --- a/unit_tests/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/Test_GlobalAveragePoolingImpl.cpp @@ -16,7 +16,7 @@ // #include <memory> #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -28,56 +28,49 @@ namespace Aidge { TEST_CASE("[gpu/operator] GlobalAveragePooling", "[GlobalAveragePooling][GPU]") { - SECTION("4D-Tensor") - { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,4,4> { //NCHW - { - 
{ - {{0, 1, 2, 3}, - {4, 5, 6, 7}, - {8, 9, 10, 11}, - {12, 13, 14, 15}}, - - {{16, 17, 18, 19}, - {20, 21, 22, 23}, - {24, 25, 26, 27}, - {28, 29, 30, 31}}, - - {{32, 33, 34, 35}, - {36, 37, 38, 39}, - {40, 41, 42, 43}, - {44, 45, 46, 47}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,3,1,1> { - { - { - {{ 7.5 }}, - {{ 23.5 }}, - {{ 39.5 }} - } + SECTION("4D-Tensor") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 3, 4, 4>{// NCHW + {{{{0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11}, + {12, 13, 14, 15}}, + + {{16, 17, 18, 19}, + {20, 21, 22, 23}, + {24, 25, 26, 27}, + {28, 29, 30, 31}}, + + {{32, 33, 34, 35}, + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 1, 3, 1, 1>{{{{{7.5}}, {{23.5}}, {{39.5}}}}}); + myInput->setBackend("cuda"); + myInput->setDataType(DataType::Float32); + // Create MyGlobalAveragePooling Operator + std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); + auto op = std::static_pointer_cast<OperatorTensor>( + globAvgPool->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->associateInput(0, myInput); + + globAvgPool->forward(); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - }); - myInput->setBackend("cuda"); - myInput->setDataType(DataType::Float32); - // Create MyGlobalAveragePooling Operator - std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); - auto op = std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator()); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - op->associateInput(0, myInput); - - globAvgPool->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; + + delete[] computedOutput; } SECTION("Random Input") { @@ -86,86 +79,101 @@ TEST_CASE("[gpu/operator] GlobalAveragePooling", std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); // To measure execution time of 'AveragePooling_Op::forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create GlobalAveragePooling Operator CUDA std::shared_ptr<Node> myGAvgPoolCuda = GlobalAveragePooling(); - auto op_cuda = 
std::static_pointer_cast<OperatorTensor>(myGAvgPoolCuda->getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myGAvgPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create GlobalAveragePooling Operator CUDA std::shared_ptr<Node> myGAvgPoolCpu = GlobalAveragePooling(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCpu->getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myGAvgPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) 
<< std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } diff --git a/unit_tests/Test_ILayerNormImpl.cpp b/unit_tests/Test_ILayerNormImpl.cpp index 0487b7c4716596e0d2e7bcbdaf812358be4de3bf..ad95d58c1982a2209c10f1da11c522b6d4695588 100644 --- a/unit_tests/Test_ILayerNormImpl.cpp +++ b/unit_tests/Test_ILayerNormImpl.cpp @@ -26,150 +26,230 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}); - std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, 0.3833238f ,-0.6842308f ,-0.4342245f ,-0.4717381f ,-0.1776187f, -0.2728751f, -0.4638580f, 0.2936697f, -0.9011016f}}); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>( + Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}); + std::shared_ptr<Tensor> myWeight = + std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, + 0.3833238f, + -0.6842308f, + -0.4342245f, + -0.4717381f, + -0.1776187f, + -0.2728751f, + -0.4638580f, + 0.2936697f, + -0.9011016f}}); myWeight->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>( + myILayerNorm->getOperator()); - op -> associateInput(1, myWeight); - op -> associateInput(2, myBias); + op->associateInput(1, myWeight); + op->associateInput(2, myBias); input0->setBackend("cuda"); - op -> associateInput(0,input0); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); // expected output - std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {9.8821178e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02}, - {4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00} - }, - { - {0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 
0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 4.9410585e-02, 0.0000000e+00}, - {4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02} - } - }, - { - { - {0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02}, - {9.8821178e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00} - }, - { - {4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00}, - {4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02} - } - } - } - }); - - - float* computedOutput = new float[output_ilayernorm->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_ilayernorm->size(), cudaMemcpyDeviceToHost); - - //test if forward result are as expected - for(int i = 0; i < output_ilayernorm->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_ilayernorm->getImpl()->rawPtr()) + i); + std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{9.8821178e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02}, + {4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00}}, + {{0.0000000e+00, + 4.9410585e-02, + 9.8821178e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 9.8821178e-02, + 4.9410585e-02, + 0.0000000e+00}, + {4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02}}}, + {{{0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02}, + {9.8821178e-02, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00}}, + {{4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00}, + {4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02}}}}}); + + float *computedOutput = new float[output_ilayernorm->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_ilayernorm->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_ilayernorm->size(); i++) { + const float targetOutput = *( + static_cast<float *>(output_ilayernorm->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - - } - + } } TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") -{ - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 
0.53444749, -0.05167147}, - }, - }, - } - }); - - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW +{ + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{ + // NCHW { + { { - { - {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5}, - }, + {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5}, }, - } - }); - - std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + }, + }}); + + std::shared_ptr<Tensor> myWeight = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{ + // NCHW { + { { - { - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, - }, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, }, - } - }); - + }, + }}); - myWeight->setBackend("cuda"); - myBias->setBackend("cuda"); + myWeight->setBackend("cuda"); + myBias->setBackend("cuda"); - std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + std::shared_ptr<Node> myILayerNorm = ILayerNorm(); + auto op = + std::static_pointer_cast<OperatorTensor>(myILayerNorm->getOperator()); - op -> associateInput(1, myWeight); - op -> associateInput(2, myBias); + op->associateInput(1, myWeight); + op->associateInput(2, myBias); - input0->setBackend("cuda"); + input0->setBackend("cuda"); - op -> associateInput(0,input0); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myILayerNorm->forward(); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myILayerNorm->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, + 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -177,24 +257,35 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myILayerNorm->backward()); - std::shared_ptr<Tensor> expectedInputGradILayerNorm = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> expectedInputGradILayerNorm = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.467678, 0.310749, 0.1129, 0.351786, 0.0507252, 0.101587, 0.130249, -0.0646476}, - }, + {0.467678, + 0.310749, + 0.1129, + 0.351786, + 0.0507252, + 0.101587, + 0.130249, + -0.0646476}, }, - } - }); - + }, + }}); float *computedInputGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedInputGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); - - //test if backward result are as expected - for(int i = 0; i < expectedInputGradILayerNorm->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradILayerNorm->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); + cudaMemcpy(computedInputGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + 
sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); + + // test if backward result are as expected + for (int i = 0; i < expectedInputGradILayerNorm->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradILayerNorm->getImpl()->rawPtr()) + + i); + REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); } delete[] computedInputGradCuda; diff --git a/unit_tests/Test_LnImpl.cpp b/unit_tests/Test_LnImpl.cpp index 06e2205ba38ce0becd0326bf4d258b9f55a228bd..9933b4b3108cf467cfd601dfba62c20b88e0b44e 100644 --- a/unit_tests/Test_LnImpl.cpp +++ b/unit_tests/Test_LnImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,84 +23,99 @@ namespace Aidge { TEST_CASE("[gpu/operator] Ln", "[Ln][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); - - // To measure execution time of 'forward()' - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Ln Operator CUDA - std::shared_ptr<Node> myLnCUDA = Ln(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myLnCUDA -> getOperator()); - - // Create Ln Operator CPU - std::shared_ptr<Node> myLnCPU = Ln(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myLnCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); - } - - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements]; - - for (std::size_t i = 0; i < nb_elements; ++i) { - array0[i] = valueDist(gen); - } - - // input0 CUDA - float* array0_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += 
std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - cudaFree(array0_d); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), + std::size_t(8)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Ln Operator CUDA + std::shared_ptr<Node> myLnCUDA = Ln(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myLnCUDA->getOperator()); + + // Create Ln Operator CPU + std::shared_ptr<Node> myLnCPU = Ln(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myLnCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); } + + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float *array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_MaxPoolingImpl.cpp 
b/unit_tests/Test_MaxPoolingImpl.cpp index 99850a0715cf8feb3164d58c410a1ef689feece1..9990a4e60970c336be426bb8bfdc2c412bd85f29 100644 --- a/unit_tests/Test_MaxPoolingImpl.cpp +++ b/unit_tests/Test_MaxPoolingImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -22,68 +22,57 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW - { - { - {{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277}, - {0.7995, 0.3638, -1.4589, -1.0843, 1.0918}, - {0.7147, 0.0936, -1.2902, 1.2037, 0.4874}, - {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305}, - {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}}, - - {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859}, - {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152}, - { 2.2329, -0.5850, 0.0700, 1.2838, -1.7363}, - { 0.2139, 0.0624, -1.0689, -0.8221, -0.8038}, - { 0.1886, -0.7840, -0.2313, 0.2651, -1.6244}} - }, - { - {{ 0.4371, 1.6417, 0.9129, 0.6325, 0.5438}, - {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011}, - {1.7653, -1.6668, -1.0814, 0.6182, 1.2071}, - {0.9541, -0.5133, 0.8664, -0.8892, 1.4585}, - {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}}, - - {{ 1.0429, 0.6279, -0.2875, 0.7187, -0.1500}, - {1.6041, 2.9635, 1.4172, -0.7517, 0.5441}, - {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614}, - {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073}, - {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}} - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{ + // NCHW + {{{{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277}, + {0.7995, 0.3638, -1.4589, -1.0843, 1.0918}, + {0.7147, 0.0936, -1.2902, 1.2037, 0.4874}, + {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305}, + {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}}, + + {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859}, + {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152}, + {2.2329, -0.5850, 0.0700, 1.2838, -1.7363}, + {0.2139, 0.0624, -1.0689, -0.8221, -0.8038}, + {0.1886, -0.7840, -0.2313, 0.2651, -1.6244}}}, + {{{0.4371, 1.6417, 0.9129, 0.6325, 0.5438}, + {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011}, + {1.7653, -1.6668, -1.0814, 0.6182, 1.2071}, + {0.9541, -0.5133, 0.8664, -0.8892, 1.4585}, + {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}}, + + {{1.0429, 0.6279, -0.2875, 0.7187, -0.1500}, + {1.6041, 2.9635, 1.4172, -0.7517, 0.5441}, + {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614}, + {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073}, + {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}}}}}); SECTION("Stride") { - std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}); - auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { - { - { - {{ 0.7995, 0.6142}, - { 2.1184, 1.3859}}, - {{ -0.4554, 2.2291}, - { 2.2329, 1.2838}} - }, - { - {{1.6417, 0.9129}, - {1.7653, 0.8664}}, - {{2.9635, 1.4172}, - {0.0857, 0.6776}} - } - } - }); - myMaxPool->getOperator()->associateInput(0,myInput); + std::shared_ptr<Node> myMaxPool = MaxPooling({2, 2}, "mycdw", {2, 2}); + auto op = + std::static_pointer_cast<OperatorTensor>(myMaxPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + {{{{0.7995, 0.6142}, {2.1184, 1.3859}}, + 
{{-0.4554, 2.2291}, {2.2329, 1.2838}}}, + {{{1.6417, 0.9129}, {1.7653, 0.8664}}, + {{2.9635, 1.4172}, {0.0857, 0.6776}}}}}); + myMaxPool->getOperator()->associateInput(0, myInput); myMaxPool->getOperator()->setDataType(DataType::Float32); myMaxPool->getOperator()->setBackend("cuda"); myMaxPool->forward(); - - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -98,86 +87,104 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(kernel), + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create MaxPooling Operator CUDA - std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, "myMaxPoolCuda", {stride, stride}); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMaxPoolCuda->getOperator()); + std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, + "myMaxPoolCuda", + {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myMaxPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create MaxPooling Operator CUDA - std::shared_ptr<Node> myMaxPoolCpu = MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride}); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMaxPoolCpu->getOperator()); + std::shared_ptr<Node> myMaxPoolCpu = + MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myMaxPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; 
++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } \ No newline at end of file diff --git a/unit_tests/Test_MulImpl.cpp b/unit_tests/Test_MulImpl.cpp index 9eaba6e80971a7075576cd3d4d409b79dac4eb0c..f4996f85269a6c1a777c2c3fa38ffcee1afc81b9 100644 --- a/unit_tests/Test_MulImpl.cpp +++ b/unit_tests/Test_MulImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,118 +23,145 @@ namespace Aidge { TEST_CASE("[gpu/operator] Mul", "[Mul][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); - - // To measure execution time of 'forward()' - 
std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Mul Operator CUDA - std::shared_ptr<Node> myMulCUDA = Mul(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMulCUDA -> getOperator()); - - // Create Mul Operator CPU - std::shared_ptr<Node> myMulCPU = Mul(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMulCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims0, dims1, dims; - for (std::size_t i = 0; i < nbDims; ++i) { - const std::size_t dim = dimSizeDist(gen); - dims0.push_back(dim); - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } - dims.push_back(std::max(dims0[i], dims1[i])); - } - - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; - - for (std::size_t i = 0; i < nb_elements0; ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < nb_elements1; ++i) { - array1[i] = valueDist(gen); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Mul Operator CUDA + std::shared_ptr<Node> myMulCUDA = Mul(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myMulCUDA->getOperator()); + + // Create Mul Operator CPU + std::shared_ptr<Node> myMulCPU = Mul(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myMulCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + } else { + dims1.push_back(dim); } + dims.push_back(std::max(dims0[i], dims1[i])); + } - // input0 CUDA - float* array0_d, *array1_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims0); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - 
cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); - - // input1 CUDA - std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); - T1_cuda->setDataType(DataType::Float32); - T1_cuda->setBackend("cuda"); - T1_cuda->resize(dims1); - op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); - T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); - - // input1 CPU - std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); - T1_cpu->setDataType(DataType::Float32); - T1_cpu->setBackend("cpu"); - T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - delete[] array1; - cudaFree(array0_d); - cudaFree(array1_d); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); } + + // input0 CUDA + float *array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void 
**>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_PadImpl.cpp b/unit_tests/Test_PadImpl.cpp index 4e799ea6b7d11c9b446e0e4c8b9d12beae24bb05..0f2488e11b4cf85b8e632e1b21ab7390d33c90f9 100644 --- a/unit_tests/Test_PadImpl.cpp +++ b/unit_tests/Test_PadImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -26,117 +26,113 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { SECTION("Symmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 0, 1, 2, 3, 4, pv}, - { pv, 5, 6, 7, 8, 9, pv}, - { pv, 10, 11, 12, 13, 14, pv}, - { pv, 15, 16, 17, 18, 19, pv}, - { pv, 20, 21, 22, 23, 24, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 25, 26, 27, 28, 29, pv}, - { pv, 30, 31, 32, 33, 34, pv}, - { pv, 35, 36, 37, 38, 39, pv}, - { pv, 40, 41, 42, 43, 44, pv}, - { pv, 45, 46, 47, 48, 49, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, 
pv, pv, pv, pv, pv}, - { pv, 50, 51, 52, 53, 54, pv}, - { pv, 55, 56, 57, 58, 59, pv}, - { pv, 60, 61, 62, 63, 64, pv}, - { pv, 65, 66, 67, 68, 69, pv}, - { pv, 70, 71, 72, 73, 74, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - }, - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 75, 76, 77, 78, 79, pv}, - { pv, 80, 81, 82, 83, 84, pv}, - { pv, 85, 86, 87, 88, 89, pv}, - { pv, 90, 91, 92, 93, 94, pv}, - { pv, 95, 96, 97, 98, 99, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - {pv, 100, 101, 102, 103, 104, pv}, - {pv, 105, 106, 107, 108, 109, pv}, - {pv, 110, 111, 112, 113, 114, pv}, - {pv, 115, 116, 117, 118, 119, pv}, - {pv, 120, 121, 122, 123, 124, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - {pv, 125, 126, 127, 128, 129, pv}, - {pv, 130, 131, 132, 133, 134, pv}, - {pv, 135, 136, 137, 138, 139, pv}, - {pv, 140, 141, 142, 143, 144, pv}, - {pv, 145, 146, 147, 148, 149, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, + "mypad", + PadBorderType::Constant, + static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 7, 7>{// NCHW + {{{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 0, 1, 2, 3, 4, pv}, + {pv, 5, 6, 7, 8, 9, pv}, + {pv, 10, 11, 12, 13, 14, pv}, + {pv, 15, 16, 17, 18, 19, pv}, + {pv, 20, 21, 22, 23, 24, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 25, 26, 27, 28, 29, pv}, + {pv, 30, 31, 32, 33, 34, pv}, + {pv, 35, 36, 37, 38, 39, pv}, + {pv, 40, 41, 42, 43, 44, pv}, + {pv, 45, 46, 47, 48, 49, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 50, 51, 52, 53, 54, pv}, + {pv, 55, 56, 57, 58, 59, pv}, + {pv, 60, 61, 62, 63, 64, pv}, + {pv, 65, 66, 67, 68, 69, pv}, + {pv, 70, 71, 72, 73, 74, pv}, + {pv, pv, pv, pv, pv, pv, pv}}}, + {{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 75, 76, 77, 78, 79, pv}, + {pv, 80, 81, 82, 83, 84, pv}, + {pv, 85, 86, 87, 88, 89, pv}, + {pv, 90, 91, 92, 93, 94, pv}, + {pv, 95, 96, 97, 98, 99, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 100, 101, 102, 103, 104, pv}, + {pv, 105, 106, 107, 108, 109, pv}, + {pv, 110, 111, 112, 113, 114, pv}, + {pv, 115, 116, 117, 118, 119, pv}, + {pv, 120, 121, 122, 123, 124, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 125, 126, 127, 128, 129, pv}, + {pv, 130, 131, 132, 133, 134, pv}, + {pv, 135, 136, 137, 138, 139, pv}, + {pv, 140, 141, 142, 143, 144, pv}, + {pv, 145, 146, 147, 148, 149, pv}, + {pv, pv, 
pv, pv, pv, pv, pv}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -146,111 +142,107 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { SECTION("Asymmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,6,6> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv}, - { 0, 1, 2, 3, 4, pv}, - { 5, 6, 7, 8, 9, pv}, - { 10, 11, 12, 13, 14, pv}, - { 15, 16, 17, 18, 19, pv}, - { 20, 21, 22, 23, 24, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 25, 26, 27, 28, 29, pv}, - { 30, 31, 32, 33, 34, pv}, - { 35, 36, 37, 38, 39, pv}, - { 40, 41, 42, 43, 44, pv}, - { 45, 46, 47, 48, 49, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 50, 51, 52, 53, 54, pv}, - { 55, 56, 57, 58, 59, pv}, - { 60, 61, 62, 63, 64, pv}, - { 65, 66, 67, 68, 69, pv}, - { 70, 71, 72, 73, 74, pv}} - }, - { - {{ pv, pv, pv, pv, pv, pv}, - { 75, 76, 77, 78, 79, pv}, - { 80, 81, 82, 83, 84, pv}, - { 85, 86, 87, 88, 89, pv}, - { 90, 91, 92, 93, 94, pv}, - { 95, 96, 97, 98, 99, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 100, 101, 102, 103, 104, pv}, - { 105, 106, 107, 108, 109, pv}, - { 110, 111, 112, 113, 114, pv}, - { 115, 116, 117, 118, 119, pv}, - { 120, 121, 122, 123, 124, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 125, 126, 127, 128, 129, pv}, - { 130, 131, 132, 133, 134, pv}, - { 135, 136, 137, 138, 139, pv}, - { 140, 141, 142, 143, 144, pv}, - { 145, 146, 147, 148, 149, pv}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, + "mypad", + PadBorderType::Constant, + 
static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 6, 6>{// NCHW + {{{{pv, pv, pv, pv, pv, pv}, + {0, 1, 2, 3, 4, pv}, + {5, 6, 7, 8, 9, pv}, + {10, 11, 12, 13, 14, pv}, + {15, 16, 17, 18, 19, pv}, + {20, 21, 22, 23, 24, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {25, 26, 27, 28, 29, pv}, + {30, 31, 32, 33, 34, pv}, + {35, 36, 37, 38, 39, pv}, + {40, 41, 42, 43, 44, pv}, + {45, 46, 47, 48, 49, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {50, 51, 52, 53, 54, pv}, + {55, 56, 57, 58, 59, pv}, + {60, 61, 62, 63, 64, pv}, + {65, 66, 67, 68, 69, pv}, + {70, 71, 72, 73, 74, pv}}}, + {{{pv, pv, pv, pv, pv, pv}, + {75, 76, 77, 78, 79, pv}, + {80, 81, 82, 83, 84, pv}, + {85, 86, 87, 88, 89, pv}, + {90, 91, 92, 93, 94, pv}, + {95, 96, 97, 98, 99, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {100, 101, 102, 103, 104, pv}, + {105, 106, 107, 108, 109, pv}, + {110, 111, 112, 113, 114, pv}, + {115, 116, 117, 118, 119, pv}, + {120, 121, 122, 123, 124, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {125, 126, 127, 128, 129, pv}, + {130, 131, 132, 133, 134, pv}, + {135, 136, 137, 138, 139, pv}, + {140, 141, 142, 143, 144, pv}, + {145, 146, 147, 148, 149, pv}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -258,115 +250,110 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Edge") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 
15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ 0, 0, 1, 2, 3, 4, 4}, - { 0, 0, 1, 2, 3, 4, 4}, - { 5, 5, 6, 7, 8, 9, 9}, - { 10, 10, 11, 12, 13, 14, 14}, - { 15, 15, 16, 17, 18, 19, 19}, - { 20, 20, 21, 22, 23, 24, 24}, - { 20, 20, 21, 22, 23, 24, 24}}, - - {{ 25, 25, 26, 27, 28, 29, 29}, - { 25, 25, 26, 27, 28, 29, 29}, - { 30, 30, 31, 32, 33, 34, 34}, - { 35, 35, 36, 37, 38, 39, 39}, - { 40, 40, 41, 42, 43, 44, 44}, - { 45, 45, 46, 47, 48, 49, 49}, - { 45, 45, 46, 47, 48, 49, 49}}, - - {{ 50, 50, 51, 52, 53, 54, 54}, - { 50, 50, 51, 52, 53, 54, 54}, - { 55, 55, 56, 57, 58, 59, 59}, - { 60, 60, 61, 62, 63, 64, 64}, - { 65, 65, 66, 67, 68, 69, 69}, - { 70, 70, 71, 72, 73, 74, 74}, - { 70, 70, 71, 72, 73, 74, 74}} - }, - { - {{ 75, 75, 76, 77, 78, 79, 79}, - { 75, 75, 76, 77, 78, 79, 79}, - { 80, 80, 81, 82, 83, 84, 84}, - { 85, 85, 86, 87, 88, 89, 89}, - { 90, 90, 91, 92, 93, 94, 94}, - { 95, 95, 96, 97, 98, 99, 99}, - { 95, 95, 96, 97, 98, 99, 99}}, - - {{100, 100, 101, 102, 103, 104, 104}, - {100, 100, 101, 102, 103, 104, 104}, - {105, 105, 106, 107, 108, 109, 109}, - {110, 110, 111, 112, 113, 114, 114}, - {115, 115, 116, 117, 118, 119, 119}, - {120, 120, 121, 122, 123, 124, 124}, - {120, 120, 121, 122, 123, 124, 124}}, - - {{125, 125, 126, 127, 128, 129, 129}, - {125, 125, 126, 127, 128, 129, 129}, - {130, 130, 131, 132, 133, 134, 134}, - {135, 135, 136, 137, 138, 139, 139}, - {140, 140, 141, 142, 143, 144, 144}, - {145, 145, 146, 147, 148, 149, 149}, - {145, 145, 146, 147, 148, 149, 149}} - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{0, 0, 1, 2, 3, 4, 4}, + {0, 0, 1, 2, 3, 4, 4}, + {5, 
5, 6, 7, 8, 9, 9}, + {10, 10, 11, 12, 13, 14, 14}, + {15, 15, 16, 17, 18, 19, 19}, + {20, 20, 21, 22, 23, 24, 24}, + {20, 20, 21, 22, 23, 24, 24}}, + + {{25, 25, 26, 27, 28, 29, 29}, + {25, 25, 26, 27, 28, 29, 29}, + {30, 30, 31, 32, 33, 34, 34}, + {35, 35, 36, 37, 38, 39, 39}, + {40, 40, 41, 42, 43, 44, 44}, + {45, 45, 46, 47, 48, 49, 49}, + {45, 45, 46, 47, 48, 49, 49}}, + + {{50, 50, 51, 52, 53, 54, 54}, + {50, 50, 51, 52, 53, 54, 54}, + {55, 55, 56, 57, 58, 59, 59}, + {60, 60, 61, 62, 63, 64, 64}, + {65, 65, 66, 67, 68, 69, 69}, + {70, 70, 71, 72, 73, 74, 74}, + {70, 70, 71, 72, 73, 74, 74}}}, + {{{75, 75, 76, 77, 78, 79, 79}, + {75, 75, 76, 77, 78, 79, 79}, + {80, 80, 81, 82, 83, 84, 84}, + {85, 85, 86, 87, 88, 89, 89}, + {90, 90, 91, 92, 93, 94, 94}, + {95, 95, 96, 97, 98, 99, 99}, + {95, 95, 96, 97, 98, 99, 99}}, + + {{100, 100, 101, 102, 103, 104, 104}, + {100, 100, 101, 102, 103, 104, 104}, + {105, 105, 106, 107, 108, 109, 109}, + {110, 110, 111, 112, 113, 114, 114}, + {115, 115, 116, 117, 118, 119, 119}, + {120, 120, 121, 122, 123, 124, 124}, + {120, 120, 121, 122, 123, 124, 124}}, + + {{125, 125, 126, 127, 128, 129, 129}, + {125, 125, 126, 127, 128, 129, 129}, + {130, 130, 131, 132, 133, 134, 134}, + {135, 135, 136, 137, 138, 139, 139}, + {140, 140, 141, 142, 143, 144, 144}, + {145, 145, 146, 147, 148, 149, 149}, + {145, 145, 146, 147, 148, 149, 149}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -374,124 +361,107 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Reflect") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - 
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - { - { 6, 5, 6, 7, 8, 9, 5}, - { 1, 0, 1, 2, 3, 4, 0}, - { 6, 5, 6, 7, 8, 9, 5}, - { 11, 10, 11, 12, 13, 14, 10}, - { 16, 15, 16, 17, 18, 19, 15}, - { 21, 20, 21, 22, 23, 24, 20}, - { 1, 0, 1, 2, 3, 4, 0} - }, - { - { 31, 30, 31, 32, 33, 34, 30}, - { 26, 25, 26, 27, 28, 29, 25}, - { 31, 30, 31, 32, 33, 34, 30}, - { 36, 35, 36, 37, 38, 39, 35}, - { 41, 40, 41, 42, 43, 44, 40}, - { 46, 45, 46, 47, 48, 49, 45}, - { 26, 25, 26, 27, 28, 29, 25} - }, - { - { 56, 55, 56, 57, 58, 59, 55}, - { 51, 50, 51, 52, 53, 54, 50}, - { 56, 55, 56, 57, 58, 59, 55}, - { 61, 60, 61, 62, 63, 64, 60}, - { 66, 65, 66, 67, 68, 69, 65}, - { 71, 70, 71, 72, 73, 74, 70}, - { 51, 50, 51, 52, 53, 54, 50} - } - }, - { - { - { 81, 80, 81, 82, 83, 84, 80}, - { 76, 75, 76, 77, 78, 79, 75}, - { 81, 80, 81, 82, 83, 84, 80}, - { 86, 85, 86, 87, 88, 89, 85}, - { 91, 90, 91, 92, 93, 94, 90}, - { 96, 95, 96, 97, 98, 99, 95}, - { 76, 75, 76, 77, 78, 79, 75} - }, - { - { 106, 105, 106, 107, 108, 109, 105}, - { 101, 100, 101, 102, 103, 104, 100}, - { 106, 105, 106, 107, 108, 109, 105}, - { 111, 110, 111, 112, 113, 114, 110}, - { 116, 115, 116, 117, 118, 119, 115}, - { 121, 120, 121, 122, 123, 124, 120}, - { 101, 100, 101, 102, 103, 104, 100} - }, - { - { 131, 130, 131, 132, 133, 134, 130}, - { 126, 125, 126, 127, 128, 129, 125}, - { 131, 130, 131, 132, 133, 134, 130}, - { 136, 135, 136, 137, 138, 139, 135}, - { 141, 140, 141, 142, 143, 144, 140}, - { 146, 145, 146, 147, 148, 149, 145}, - { 126, 125, 126, 127, 128, 129, 125} - } - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{6, 5, 6, 7, 8, 9, 5}, + {1, 0, 1, 2, 3, 4, 0}, + {6, 5, 6, 7, 8, 9, 5}, + {11, 10, 11, 12, 13, 14, 10}, + {16, 15, 16, 17, 18, 19, 15}, + {21, 20, 21, 22, 23, 24, 20}, + {1, 0, 1, 2, 3, 4, 0}}, + {{31, 30, 31, 32, 33, 34, 30}, + {26, 25, 26, 27, 28, 29, 25}, + {31, 30, 31, 32, 33, 34, 30}, + {36, 35, 36, 37, 38, 39, 35}, + {41, 40, 41, 42, 43, 44, 40}, + {46, 45, 46, 47, 48, 49, 45}, + {26, 25, 26, 27, 28, 29, 25}}, + {{56, 55, 56, 57, 58, 59, 55}, + {51, 50, 51, 52, 53, 54, 50}, + {56, 55, 56, 57, 58, 59, 55}, + {61, 60, 61, 62, 63, 64, 60}, + {66, 65, 66, 67, 68, 69, 65}, + {71, 70, 71, 72, 73, 74, 70}, + {51, 50, 51, 52, 53, 54, 50}}}, + {{{81, 80, 81, 82, 83, 84, 80}, + {76, 75, 76, 77, 78, 79, 75}, + {81, 80, 81, 82, 83, 84, 80}, + {86, 85, 86, 87, 88, 
89, 85}, + {91, 90, 91, 92, 93, 94, 90}, + {96, 95, 96, 97, 98, 99, 95}, + {76, 75, 76, 77, 78, 79, 75}}, + {{106, 105, 106, 107, 108, 109, 105}, + {101, 100, 101, 102, 103, 104, 100}, + {106, 105, 106, 107, 108, 109, 105}, + {111, 110, 111, 112, 113, 114, 110}, + {116, 115, 116, 117, 118, 119, 115}, + {121, 120, 121, 122, 123, 124, 120}, + {101, 100, 101, 102, 103, 104, 100}}, + {{131, 130, 131, 132, 133, 134, 130}, + {126, 125, 126, 127, 128, 129, 125}, + {131, 130, 131, 132, 133, 134, 130}, + {136, 135, 136, 137, 138, 139, 135}, + {141, 140, 141, 142, 143, 144, 140}, + {146, 145, 146, 147, 148, 149, 145}, + {126, 125, 126, 127, 128, 129, 125}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -499,116 +469,111 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Wrap") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ 24, 20, 21, 22, 23, 24, 20}, - { 4, 0, 1, 2, 3, 4, 0}, - { 9, 5, 6, 7, 8, 9, 5}, - { 14, 10, 11, 12, 13, 14, 10}, - { 19, 15, 16, 17, 18, 19, 15}, - { 24, 20, 21, 22, 23, 24, 20}, - { 4, 0, 1, 2, 3, 4, 0}}, - - {{ 49, 45, 46, 47, 48, 49, 45}, - { 29, 25, 26, 27, 28, 29, 25}, - { 34, 30, 31, 32, 33, 34, 30}, - { 39, 35, 36, 37, 38, 39, 35}, - { 44, 40, 41, 42, 43, 44, 40}, - { 49, 45, 46, 47, 48, 49, 45}, - { 29, 25, 26, 27, 28, 29, 25}}, - - {{ 74, 70, 71, 72, 73, 74, 70}, - { 54, 50, 51, 52, 53, 54, 50}, - { 59, 55, 56, 57, 58, 59, 55}, - { 64, 60, 61, 62, 63, 64, 60}, - { 69, 65, 66, 67, 68, 69, 65}, - { 74, 70, 71, 72, 73, 74, 70}, - { 
54, 50, 51, 52, 53, 54, 50}} - }, - { - {{ 99, 95, 96, 97, 98, 99, 95}, - { 79, 75, 76, 77, 78, 79, 75}, - { 84, 80, 81, 82, 83, 84, 80}, - { 89, 85, 86, 87, 88, 89, 85}, - { 94, 90, 91, 92, 93, 94, 90}, - { 99, 95, 96, 97, 98, 99, 95}, - { 79, 75, 76, 77, 78, 79, 75}}, - - {{124, 120, 121, 122, 123, 124, 120}, - {104, 100, 101, 102, 103, 104, 100}, - {109, 105, 106, 107, 108, 109, 105}, - {114, 110, 111, 112, 113, 114, 110}, - {119, 115, 116, 117, 118, 119, 115}, - {124, 120, 121, 122, 123, 124, 120}, - {104, 100, 101, 102, 103, 104, 100}}, - - {{149, 145, 146, 147, 148, 149, 145}, - {129, 125, 126, 127, 128, 129, 125}, - {134, 130, 131, 132, 133, 134, 130}, - {139, 135, 136, 137, 138, 139, 135}, - {144, 140, 141, 142, 143, 144, 140}, - {149, 145, 146, 147, 148, 149, 145}, - {129, 125, 126, 127, 128, 129, 125}} - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{24, 20, 21, 22, 23, 24, 20}, + {4, 0, 1, 2, 3, 4, 0}, + {9, 5, 6, 7, 8, 9, 5}, + {14, 10, 11, 12, 13, 14, 10}, + {19, 15, 16, 17, 18, 19, 15}, + {24, 20, 21, 22, 23, 24, 20}, + {4, 0, 1, 2, 3, 4, 0}}, + + {{49, 45, 46, 47, 48, 49, 45}, + {29, 25, 26, 27, 28, 29, 25}, + {34, 30, 31, 32, 33, 34, 30}, + {39, 35, 36, 37, 38, 39, 35}, + {44, 40, 41, 42, 43, 44, 40}, + {49, 45, 46, 47, 48, 49, 45}, + {29, 25, 26, 27, 28, 29, 25}}, + + {{74, 70, 71, 72, 73, 74, 70}, + {54, 50, 51, 52, 53, 54, 50}, + {59, 55, 56, 57, 58, 59, 55}, + {64, 60, 61, 62, 63, 64, 60}, + {69, 65, 66, 67, 68, 69, 65}, + {74, 70, 71, 72, 73, 74, 70}, + {54, 50, 51, 52, 53, 54, 50}}}, + {{{99, 95, 96, 97, 98, 99, 95}, + {79, 75, 76, 77, 78, 79, 75}, + {84, 80, 81, 82, 83, 84, 80}, + {89, 85, 86, 87, 88, 89, 85}, + {94, 90, 91, 92, 93, 94, 90}, + {99, 95, 96, 97, 98, 99, 95}, + {79, 75, 76, 77, 78, 79, 75}}, + + {{124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}, + {109, 105, 106, 107, 108, 109, 105}, + {114, 110, 111, 112, 113, 114, 110}, + {119, 115, 116, 117, 118, 119, 115}, + {124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}}, + + {{149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}, + {134, 130, 131, 132, 133, 134, 130}, + {139, 135, 136, 137, 138, 139, 135}, + {144, 140, 141, 142, 143, 144, 140}, + {149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + 
myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -620,77 +585,103 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), std::size_t(1)); - // TODO: fix Reflect and Wrap Pad, cpu and gpu only five same results when padding = 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10)); - std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), std::size_t(5)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), + std::size_t(1)); + // TODO: fix Reflect and Wrap Pad, cpu and gpu only give the same results + // when padding = 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), + std::size_t(5)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t borderType = padTypeDist(gen); const std::size_t padding = padSizeDist(gen); // Create Pad Operator CUDA - std::shared_ptr<Node> myPadCUDA = Pad<2>({padding, padding, padding, padding}, "mypadcuda", static_cast<PadBorderType>(borderType)); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPadCUDA -> getOperator()); + std::shared_ptr<Node> myPadCUDA = + Pad<2>({padding, padding, padding, padding}, + "mypadcuda", + static_cast<PadBorderType>(borderType)); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPadCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create Pad Operator CPU - std::shared_ptr<Node> myPadCPU = Pad<2>({padding, padding, padding, padding}, "mypadcpu", static_cast<PadBorderType>(borderType)); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPadCPU -> getOperator()); + std::shared_ptr<Node> 
myPadCPU = + Pad<2>({padding, padding, padding, padding}, + "mypadcpu", + static_cast<PadBorderType>(borderType)); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPadCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - float* array0 = new float[nb_elements]; + float *array0 = new float[nb_elements]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -702,80 +693,80 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } TEST_CASE("[gpu/operator] Pad(backward)", "[Pad][GPU]") { - SECTION("Symmetric Pad") { + SECTION("Symmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, + "mypad", + PadBorderType::Constant, + static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = 
std::make_shared<Tensor>( + Array4D<float, 1, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,3,7,7> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 0, 1, 2, 3, 4, pv}, - { pv, 5, 6, 7, 8, 9, pv}, - { pv, 10, 11, 12, 13, 14, pv}, - { pv, 15, 16, 17, 18, 19, pv}, - { pv, 20, 21, 22, 23, 24, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 25, 26, 27, 28, 29, pv}, - { pv, 30, 31, 32, 33, 34, pv}, - { pv, 35, 36, 37, 38, 39, pv}, - { pv, 40, 41, 42, 43, 44, pv}, - { pv, 45, 46, 47, 48, 49, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 50, 51, 52, 53, 54, pv}, - { pv, 55, 56, 57, 58, 59, pv}, - { pv, 60, 61, 62, 63, 64, pv}, - { pv, 65, 66, 67, 68, 69, pv}, - { pv, 70, 71, 72, 73, 74, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 3, 7, 7>{// NCHW + {{{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 0, 1, 2, 3, 4, pv}, + {pv, 5, 6, 7, 8, 9, pv}, + {pv, 10, 11, 12, 13, 14, pv}, + {pv, 15, 16, 17, 18, 19, pv}, + {pv, 20, 21, 22, 23, 24, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 25, 26, 27, 28, 29, pv}, + {pv, 30, 31, 32, 33, 34, pv}, + {pv, 35, 36, 37, 38, 39, pv}, + {pv, 40, 41, 42, 43, 44, pv}, + {pv, 45, 46, 47, 48, 49, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 50, 51, 52, 53, 54, pv}, + {pv, 55, 56, 57, 58, 59, pv}, + {pv, 60, 61, 62, 63, 64, pv}, + {pv, 65, 66, 67, 68, 69, pv}, + {pv, 70, 71, 72, 73, 74, pv}, + {pv, pv, pv, pv, pv, pv, pv}}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myPad->backward()); float *computedGradCuda = new float[myInput->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * myInput->size(), + cudaMemcpyDeviceToHost); myInput->setBackend("cpu"); - for(int i = 0; i < myInput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myInput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myInput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_PowImpl.cpp b/unit_tests/Test_PowImpl.cpp index 49e65b46d7d85b7087c5c73151d643593d91e02e..ab419d0b07029365600d9056e17f84a5e8442b27 100644 --- a/unit_tests/Test_PowImpl.cpp +++ b/unit_tests/Test_PowImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution 
#include <catch2/catch_test_macros.hpp> @@ -27,10 +27,14 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); // To measure execution time of 'MatPow_Op::forward()' member function call std::chrono::time_point<std::chrono::system_clock> start; @@ -38,21 +42,19 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { std::chrono::duration<double, std::micro> duration{}; SECTION("PowImpl::forward()") { - SECTION("Scalar / Scalar") { - - } - SECTION("Scalar / +1-D Tensor") { - - } + SECTION("Scalar / Scalar") {} + SECTION("Scalar / +1-D Tensor") {} SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -65,12 +67,16 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // without broadcasting - float* array0 = new float[nb_elements]; - float* array1 = new float[nb_elements]; + float *array0 = new float[nb_elements]; + float *array1 = new float[nb_elements]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -78,14 +84,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU @@ -93,8 +103,8 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); 
T0_cpu->resize(dims); - op_cpu->associateInput(0,T0_cpu); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -102,8 +112,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements); // input1 @@ -111,21 +125,25 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); // REQUIRE(false); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -133,26 +151,31 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - + std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors - // handle dimensions, replace some dimensions with '1' to get broadcasting + // handle dimensions, replace some dimensions with '1' to get + // broadcasting constexpr std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { @@ -172,10 +195,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // create arrays and fill them with random values - std::size_t array0_size = 
std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array0 = new float[array0_size]; - float* array1 = new float[array1_size]; + std::size_t array0_size = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + std::size_t array1_size = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array0 = new float[array0_size]; + float *array1 = new float[array1_size]; for (std::size_t i = 0; i < array0_size; ++i) { array0[i] = valueDist(gen); @@ -184,23 +215,27 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { array1[i] = valueDist(gen); } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); - cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * array0_size); + cudaMemcpy(array0_d, + array0, + sizeof(float) * array0_size, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + T0_cpu->getImpl()->setRawPtr(array0, array0_size); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -208,8 +243,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); - cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * array1_size); + cudaMemcpy(array1_d, + array1, + sizeof(float) * array1_size, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); // input1 @@ -217,20 +256,24 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, array1_size); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -238,25 +281,35 @@ TEST_CASE("[gpu/operator] Pow", 
"[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } SECTION("+1-D Tensor / 1-D Tensor") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); std::size_t number_of_operation = 0; - std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3)); + std::uniform_int_distribution<std::size_t> nbRemovedDimsDist( + std::size_t(1), + std::size_t(3)); for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors @@ -273,13 +326,22 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { dims1[i] = 1; } } - dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); + dims1.erase(dims1.cbegin(), + dims1.cbegin() + nbRemovedDimsDist(gen)); // create arrays and fill them with random values - std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array0 = new float[array0_size]; - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array1 = new float[array1_size]; + std::size_t array0_size = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array0 = new float[array0_size]; + std::size_t array1_size = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array1 = new float[array1_size]; for (std::size_t i = 0; i < array0_size; ++i) { array0[i] = valueDist(gen); @@ -289,23 +351,27 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); - cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * array0_size); + cudaMemcpy(array0_d, + array0, + sizeof(float) * array0_size, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); 
T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + T0_cpu->getImpl()->setRawPtr(array0, array0_size); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -313,8 +379,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); - cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * array1_size); + cudaMemcpy(array1_d, + array1, + sizeof(float) * array1_size, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); // input1 @@ -322,20 +392,24 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, array1_size); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -343,12 +417,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } } } diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp index 7ab38aa7def7f846555ae33ccd3871d6ee5a1539..dc4c918e6577bc0004de89ca722f48d530b95112 100644 --- a/unit_tests/Test_ReLUImpl.cpp +++ b/unit_tests/Test_ReLUImpl.cpp @@ -12,7 +12,7 @@ #include <array> #include <catch2/catch_test_macros.hpp> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -21,109 +21,91 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { SECTION("Constant Input") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 
0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - }, - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - }, - { - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - }, - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - }, - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - }, - { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - }, - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - } - } - }); + std::shared_ptr<Tensor> input0 = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}, + {{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}}, + {{{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}, + {{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}, + {{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}}, + {{{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}, + {{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}}}}); std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); - op->associateInput(0,input0); + auto op = + std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Random Input") - { + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); // Max nbDims supported by cudnn is 8 + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist( + std::size_t(1), + std::size_t(8)); // Max nbDims supported by cudnn is 8 // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; 
std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create ReLU Operator std::shared_ptr<Node> myReLU = ReLU("myReLU"); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>( + myReLU->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); // generate a random Tensor const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Create the input Tensor @@ -136,25 +118,32 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { // Fill input tensor float *input_h = new float[nb_elements]; float *output_h = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { float value = valueDist(gen); input_h[i] = value; - output_h[i] = value>=0?value:0.0f; + output_h[i] = value >= 0 ? value : 0.0f; } float *input_d; - cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements); - cudaMemcpy(input_d, input_h, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&input_d), + sizeof(float) * nb_elements); + cudaMemcpy(input_d, + input_h, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0->getImpl()->setRawPtr(input_d, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); myReLU->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); float *computedOutput = new float[nb_elements](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); REQUIRE(approxEq<float>(*computedOutput, *output_h)); @@ -162,8 +151,8 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { delete[] input_h; cudaFree(input_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } diff --git a/unit_tests/Test_ReduceMeanImpl.cpp b/unit_tests/Test_ReduceMeanImpl.cpp index 041ad6e02d5f39fde22f34ce715d2b807e164b1a..6ed30534368a976cb666ec0dd93f1331bf2eb7ee 100644 --- a/unit_tests/Test_ReduceMeanImpl.cpp +++ b/unit_tests/Test_ReduceMeanImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include 
<catch2/catch_test_macros.hpp> @@ -25,89 +25,69 @@ namespace Aidge { TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { SECTION("KeepDims") { SECTION("test 1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - {{ 12.5, 1.5 }}, - {{ 35.0, 1.5 }}, - {{ 57.5, 1.5 }} - } - }); + {{12.5, 1.5}}, + {{35.0, 1.5}}, + {{57.5, 1.5}}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("test 2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { - { - { - { 0.0, 0.0 }, - { 1.0, 1.0 }, - { 2.0, 2.0 } - }, - { - { 3.0, 3.0 }, - { 4.0, 4.0 }, - { 5.0, 5.0 } - }, - { - { 6.0, 6.0 }, - { 7.0, 7.0 }, - { 8.0, 8.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array3D<float, 3, 3, 2>{ + {{{0.0, 0.0}, {1.0, 1.0}, {2.0, 2.0}}, + {{3.0, 3.0}, {4.0, 4.0}, {5.0, 5.0}}, + {{6.0, 6.0}, {7.0, 7.0}, {8.0, 8.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 1>{{ - {{ 1.0 }}, - {{ 4.0 }}, - {{ 7.0 }} - } - }); + {{1.0}}, + {{4.0}}, + {{7.0}}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new 
float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -115,145 +95,122 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } } SECTION("not_KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - { 12.5, 1.5 }, - { 35.0, 1.5 }, - { 57.5, 1.5 } - } - }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{12.5, 1.5}, {35.0, 1.5}, {57.5, 1.5}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "computed: " << computedOutput[i] << ", target: " << targetOutput << std::endl; + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "computed: " << computedOutput[i] + << ", target: " << targetOutput << std::endl; REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; - } SECTION("all_axes") { SECTION("1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {18.25} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{18.25}}); std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* 
computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { - {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, - { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, - { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, - { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, - { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 5, 4>{ + {{0.004232f, 0.105120f, 0.045124f, 0.009205f}, + {0.000766f, 0.272162f, 0.503560f, 0.044163f}, + {0.049755f, 0.000305f, 0.143634f, 0.013253f}, + {0.096258f, 0.311231f, 0.358143f, 0.000452f}, + {0.468617f, 0.015693f, 0.145316f, 0.000105f}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {0.1293547f} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{0.1293547f}}); std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("noop_with_empty_axes") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, true); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); 
op->setBackend("cuda"); myReduceMean->forward(); - + myInput->setBackend("cpu"); - float* computedOutput = new float[myInput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myInput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myInput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myInput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myInput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myInput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -264,70 +221,48 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") { SECTION("KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - - {{ 1.0, 2.0 }}, - {{ 3.0, 4.0 }}, - {{ 5.0, 6.0 }} - } - }); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 1.0, 2.0 }, - { 1.0, 2.0 } - }, - { - { 3.0, 4.0 }, - { 3.0, 4.0 } - }, - { - { 5.0, 6.0 }, - { 5.0, 6.0 } - } - } - }); + {{1.0, 2.0}}, + {{3.0, 4.0}}, + {{5.0, 6.0}}}}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}}, + {{5.0, 6.0}, {5.0, 6.0}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReduceMean->backward()); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } } -} +} // namespace Aidge diff --git a/unit_tests/Test_ReduceSumImpl.cpp b/unit_tests/Test_ReduceSumImpl.cpp index 
d0d37754102331c8f91a1ce1c81d679761916339..a640d1f7e3585e7068dbd0ba935a31cb2e725df6 100644 --- a/unit_tests/Test_ReduceSumImpl.cpp +++ b/unit_tests/Test_ReduceSumImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -25,89 +25,68 @@ namespace Aidge { TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { SECTION("KeepDims") { SECTION("test 1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - {{ 25.0, 3.0 }}, - {{ 70.0, 3.0 }}, - {{ 115.0, 3.0 }} - } - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{ + {{{25.0, 3.0}}, {{70.0, 3.0}}, {{115.0, 3.0}}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "i: " << i << ", computed: " << computedOutput[i] << ", target: "<< targetOutput <<std::endl; + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "i: " << i << ", computed: " << computedOutput[i] + << ", target: " << targetOutput << std::endl; REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("test 2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { - { - { - { 0.0, 0.0 }, - { 1.0, 1.0 }, - { 2.0, 2.0 } - }, - { - { 3.0, 3.0 }, - { 4.0, 4.0 }, - { 5.0, 5.0 } - }, - { - { 6.0, 6.0 }, - { 7.0, 7.0 }, - { 8.0, 8.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array3D<float, 3, 3, 2>{ + {{{0.0, 0.0}, {1.0, 1.0}, {2.0, 2.0}}, + {{3.0, 3.0}, {4.0, 4.0}, {5.0, 5.0}}, + {{6.0, 6.0}, {7.0, 7.0}, {8.0, 8.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 1>{{ - {{ 6.0 }}, - {{ 24.0 }}, - {{ 42.0 }} - } - }); + {{6.0}}, + {{24.0}}, + {{42.0}}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}); - auto op = 
std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -115,109 +94,92 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { } } SECTION("not_KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - { 25.0, 3.0 }, - { 70.0, 3.0 }, - { 115.0, 3.0 } - } - }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{25.0, 3.0}, {70.0, 3.0}, {115.0, 3.0}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; - } SECTION("all_axes") { SECTION("1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {219.0} - }); + std::shared_ptr<Tensor> myOutput = + 
std::make_shared<Tensor>(Array1D<float, 1>{{219.0}}); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { - {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, - { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, - { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, - { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, - { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 5, 4>{ + {{0.004232f, 0.105120f, 0.045124f, 0.009205f}, + {0.000766f, 0.272162f, 0.503560f, 0.044163f}, + {0.049755f, 0.000305f, 0.143634f, 0.013253f}, + {0.096258f, 0.311231f, 0.358143f, 0.000452f}, + {0.468617f, 0.015693f, 0.145316f, 0.000105f}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {2.587094f} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{2.587094f}}); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -228,70 +190,48 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") { SECTION("KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } 
- } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - - {{ 1.0, 2.0 }}, - {{ 3.0, 4.0 }}, - {{ 5.0, 6.0 }} - } - }); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 1.0, 2.0 }, - { 1.0, 2.0 } - }, - { - { 3.0, 4.0 }, - { 3.0, 4.0 } - }, - { - { 5.0, 6.0 }, - { 5.0, 6.0 } - } - } - }); + {{1.0, 2.0}}, + {{3.0, 4.0}}, + {{5.0, 6.0}}}}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}}, + {{5.0, 6.0}, {5.0, 6.0}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReduceSum->backward()); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } } -} +} // namespace Aidge diff --git a/unit_tests/Test_ReshapeImpl.cpp b/unit_tests/Test_ReshapeImpl.cpp index df9a4dda6d59371c8dd07f8c4442e3a3bb4a7159..a8a03a3e427a3d760342053f2d0baa6b9128e4af 100644 --- a/unit_tests/Test_ReshapeImpl.cpp +++ b/unit_tests/Test_ReshapeImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate, std::shuffle, std::transform -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -22,250 +22,256 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] Reshape(forward)") { SECTION("1D Tensor") { - std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<float,6> { - {1.0, 2.0, 3.0, 4.0, 5.0, 6.0} - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> input = std::make_shared<Tensor>( + Array1D<float, 6>{{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({2, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + 
std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, input); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2D Tensor") { - std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } + std::shared_ptr<Tensor> input = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}} - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - {1.0, 2.0}, - {3.0, 4.0}, - {5.0, 6.0} - } - }); + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, input); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Random Input") - { + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8 + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist( + std::size_t(1), + std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8 // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; 
std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate a random Tensor const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims, shuffeledDims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } shuffeledDims = dims; std::shuffle(shuffeledDims.begin(), shuffeledDims.end(), gen); std::vector<std::int64_t> shuffeledIntDims(shuffeledDims.size()); - std::transform(shuffeledDims.begin(), shuffeledDims.end(), shuffeledIntDims.begin(), - [](int value) { return static_cast<std::int64_t>(value); }); + std::transform( + shuffeledDims.begin(), + shuffeledDims.end(), + shuffeledIntDims.begin(), + [](int value) { return static_cast<std::int64_t>(value); }); // Create Reshape Operator CUDA - std::shared_ptr<Node> myReshapeCuda = Reshape(shuffeledIntDims, false,"myreshapecuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myReshapeCuda->getOperator()); + std::shared_ptr<Node> myReshapeCuda = + Reshape(shuffeledIntDims, false, "myreshapecuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myReshapeCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create Reshape Operator CPU - std::shared_ptr<Node> myReshapeCpu = Reshape(shuffeledIntDims, false,"myreshapecpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myReshapeCpu->getOperator()); + std::shared_ptr<Node> myReshapeCpu = + Reshape(shuffeledIntDims, false, "myreshapecpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myReshapeCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + 
end - start); float *computed_cuda = new float[nb_elements]; - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } TEST_CASE("[gpu/operator] Reshape(backward)") { SECTION("1D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({6}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array1D<float, 6> { - {1, 2, 3, 4, 5, 6} - }); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array1D<float, 6>{{1, 2, 3, 4, 5, 6}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReshape->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } SECTION("2D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + 
std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> { - { - {1.0, 2.0}, - {3.0, 4.0}, - {5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReshape->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_ShiftGELUImpl.cpp b/unit_tests/Test_ShiftGELUImpl.cpp index 86e747e735eccb397caa8062f52c2561e8ef759d..99194c012fe8b2c15b5eb99f2bf1b2bd51d17cfc 100644 --- a/unit_tests/Test_ShiftGELUImpl.cpp +++ b/unit_tests/Test_ShiftGELUImpl.cpp @@ -26,103 +26,219 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - - //expected output of shiftgelu forward operator - std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0.991388f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f }, - { 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f } - }, - { - { 0.0f, 0.413078f, 0.991388f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.991388f, 0.413078f, 0.0f }, - { 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.413078f } - } - }, - { - { - { 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f }, - { 0.991388f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f} - }, - { - { 
0.413078f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f }, - { 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.0f, 0.0f, 0.413078f } - } - } - } - }); - - //expected output of GELU forward operator (computed with PyTorch) - std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { - { - { - { - { 0.7982f, 0.3285f, 0.3809f, 0.3371f, 0.4262f, 0.7661f, 0.0000f, 0.0000f, 0.4447f, 0.4447f }, - { 0.6820f, 0.0314f, 0.0598f, 0.7028f, 0.3899f, 0.0657f, 0.6305f, 0.3285f, 0.2702f, 0.0902f } - }, - { - { 0.1428f, 0.3115f, 0.8090f, 0.1093f, 0.4824f, 0.0657f, 0.2948f, 0.8413f, 0.2384f, 0.0482f }, - { 0.2948f, 0.4729f, 0.1225f, 0.4170f, 0.0260f, 0.1428f, 0.3989f, 0.0370f, 0.3371f, 0.6203f } - } - }, - { - { - { 0.0000f, 0.0717f, 0.3899f, 0.2784f, 0.3371f, 0.1709f, 0.3632f, 0.3899f, 0.2152f, 0.6820f }, - { 0.8197f, 0.2002f, 0.0482f, 0.0260f, 0.2384f, 0.3200f, 0.4635f, 0.0717f, 0.5306f, 0.0102f } - }, - { - { 0.5209f, 0.0717f, 0.5701f, 0.4447f, 0.1497f, 0.7028f, 0.3115f, 0.2622f, 0.6407f, 0.0314f }, - { 0.7238f, 0.2002f, 0.4447f, 0.1428f, 0.5306f, 0.1359f, 0.0482f, 0.0154f, 0.0778f, 0.6305f } - } - } - } - }); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + + // expected output of shiftgelu forward operator + std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.991388f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f}, + {0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f}}, + {{0.0f, + 0.413078f, + 0.991388f, + 0.0f, + 0.413078f, + 0.0f, + 0.413078f, + 0.991388f, + 0.413078f, + 0.0f}, + {0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f}}}, + {{{0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f}, + {0.991388f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f}}, + {{0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f}, + {0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.413078f}}}}}); + + // expected output of GELU forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.7982f, + 0.3285f, + 0.3809f, + 0.3371f, + 0.4262f, + 0.7661f, + 0.0000f, + 0.0000f, + 0.4447f, + 0.4447f}, + {0.6820f, + 0.0314f, + 0.0598f, + 0.7028f, + 0.3899f, + 0.0657f, + 0.6305f, + 0.3285f, + 0.2702f, + 0.0902f}}, + {{0.1428f, + 0.3115f, + 0.8090f, + 0.1093f, + 0.4824f, + 0.0657f, + 0.2948f, + 0.8413f, + 0.2384f, + 0.0482f}, + {0.2948f, + 0.4729f, + 0.1225f, + 0.4170f, + 0.0260f, + 0.1428f, + 0.3989f, + 0.0370f, + 0.3371f, + 0.6203f}}}, + {{{0.0000f, + 
0.0717f, + 0.3899f, + 0.2784f, + 0.3371f, + 0.1709f, + 0.3632f, + 0.3899f, + 0.2152f, + 0.6820f}, + {0.8197f, + 0.2002f, + 0.0482f, + 0.0260f, + 0.2384f, + 0.3200f, + 0.4635f, + 0.0717f, + 0.5306f, + 0.0102f}}, + {{0.5209f, + 0.0717f, + 0.5701f, + 0.4447f, + 0.1497f, + 0.7028f, + 0.3115f, + 0.2622f, + 0.6407f, + 0.0314f}, + {0.7238f, + 0.2002f, + 0.4447f, + 0.1428f, + 0.5306f, + 0.1359f, + 0.0482f, + 0.0154f, + 0.0778f, + 0.6305f}}}}}); std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU -> getOperator()); - op->associateInput(0,input0); + auto op = std::static_pointer_cast<OperatorTensor>( + myShiftGELU->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - - float* computedOutput = new float[output_shiftGELU->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftGELU->size(), cudaMemcpyDeviceToHost); - //test if forward result are as expected - for(int i = 0; i < output_shiftGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_shiftGELU->getImpl()->rawPtr()) + i); + float *computedOutput = new float[output_shiftGELU->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_shiftGELU->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_shiftGELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_shiftGELU->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - //measure difference between GELU and shiftgelu + // measure difference between GELU and shiftgelu float sum = 0.0; - for(int i = 0; i < output_GELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_GELU->getImpl()->rawPtr()) + i); + for (int i = 0; i < output_GELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_GELU->getImpl()->rawPtr()) + i); sum += fabs(computedOutput[i] - targetOutput); } sum = sum / output_GELU->size(); @@ -130,42 +246,54 @@ TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { delete[] computedOutput; } - } TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, - }, - }, - } - }); - + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + input0->setBackend("cuda"); std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); myShiftGELU->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, 
+ 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -173,48 +301,66 @@ TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myShiftGELU->backward()); - //expected output of shiftgelu backward operator - std::shared_ptr<Tensor> expectedInputGradShiftGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of shiftgelu backward operator + std::shared_ptr<Tensor> expectedInputGradShiftGELU = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.88094, 1.09182, 0.134203, 0.439603, 0.0696628, 0.173469, 0.254718, -0.084009}, - }, + {1.88094, + 1.09182, + 0.134203, + 0.439603, + 0.0696628, + 0.173469, + 0.254718, + -0.084009}, }, - } - }); + }, + }}); - //expected output of gelu backward operator (computed with PyTorch) - std::shared_ptr<Tensor> expectedInputGradGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of gelu backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradGELU = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.5159, 1.0188, 0.0971, 0.4578, 0.0931, -0.0499, 0.3620, -0.1000}, - }, + {1.5159, + 1.0188, + 0.0971, + 0.4578, + 0.0931, + -0.0499, + 0.3620, + -0.1000}, }, - } - }); - + }, + }}); float *computedGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); - //test if backward result are as expected - for(int i = 0; i < expectedInputGradShiftGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradShiftGELU->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); + // test if backward result are as expected + for (int i = 0; i < expectedInputGradShiftGELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradShiftGELU->getImpl()->rawPtr()) + + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); } - //measure difference between gelu and shifgelu + // measure difference between gelu and shifgelu float sum = 0.0; - for(int i = 0; i < expectedInputGradGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradGELU->getImpl()->rawPtr()) + i); - sum += fabs(computedGradCuda[i] - targetOutput); - } - sum = sum / expectedInputGradGELU->size(); - REQUIRE(sum < 2e-1); - + for (int i = 0; i < expectedInputGradGELU->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGradGELU->getImpl()->rawPtr()) + + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradGELU->size(); + REQUIRE(sum < 2e-1); delete[] computedGradCuda; } diff --git a/unit_tests/Test_ShiftMaxImpl.cpp b/unit_tests/Test_ShiftMaxImpl.cpp index 2a94a23c3a04edd72cb535ebfb6e2c538e4aeee8..1ae24357398c7ad35f817937d9ffc993b82a8091 100644 --- a/unit_tests/Test_ShiftMaxImpl.cpp +++ b/unit_tests/Test_ShiftMaxImpl.cpp @@ -26,101 +26,217 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 
0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - //expected output of shiftmax forward operator - std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.055542f, 0.055542f, 0.111084f, 0.111084f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - }, - { - { 0.0624695f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - } - }, - { - { - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }, - { 0.124969f, 0.124969f, 0.0624695f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f } - }, - { - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - } - } - } - }); - //expected output of softmax forward operator (computed with PyTorch) - std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { - { - { - { - { 0.1484f, 0.0918f, 0.0975f, 0.0928f, 0.1025f, 0.1440f, 0.0568f, 0.0568f, 0.1046f, 0.1046f }, - { 0.1436f, 0.0652f, 0.0685f, 0.1465f, 0.1064f, 0.0692f, 0.1366f, 0.0992f, 0.0925f, 0.0721f } - }, - { - { 0.0768f, 0.0957f, 0.1593f, 0.0730f, 0.1157f, 0.0681f, 0.0938f, 0.1642f, 0.0874f, 0.0661f }, - { 0.1005f, 0.1227f, 0.0798f, 0.1156f, 0.0680f, 0.0823f, 0.1133f, 0.0694f, 0.1056f, 0.1426f } - } - }, - { - { - { 0.0645f, 0.0734f, 0.1118f, 0.0981f, 0.1052f, 0.0853f, 0.1085f, 0.1118f, 0.0906f, 0.1509f }, - { 0.1743f, 0.0901f, 0.0716f, 0.0688f, 0.0947f, 0.1047f, 0.1228f, 0.0745f, 0.1317f, 0.0667f } - }, - { - { 0.1164f, 0.0665f, 0.1224f, 0.1075f, 0.0750f, 0.1394f, 0.0925f, 0.0871f, 0.1313f, 0.0620f }, - { 0.1551f, 0.0877f, 0.1172f, 0.0810f, 0.1283f, 0.0802f, 0.0697f, 0.0656f, 0.0733f, 0.1418f } - } - } - } - }); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + // expected output of shiftmax forward operator + std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.111084f, + 0.111084f, + 0.111084f, + 0.111084f, + 0.111084f, + 
0.111084f, + 0.055542f, + 0.055542f, + 0.111084f, + 0.111084f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}, + {{0.0624695f, + 0.124969f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.124969f, + 0.124969f, + 0.0624695f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}}, + {{{0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}, + {0.124969f, + 0.124969f, + 0.0624695f, + 0.0624695f, + 0.124969f, + 0.124969f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.0624695f}}, + {{0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}}}}); + // expected output of softmax forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.1484f, + 0.0918f, + 0.0975f, + 0.0928f, + 0.1025f, + 0.1440f, + 0.0568f, + 0.0568f, + 0.1046f, + 0.1046f}, + {0.1436f, + 0.0652f, + 0.0685f, + 0.1465f, + 0.1064f, + 0.0692f, + 0.1366f, + 0.0992f, + 0.0925f, + 0.0721f}}, + {{0.0768f, + 0.0957f, + 0.1593f, + 0.0730f, + 0.1157f, + 0.0681f, + 0.0938f, + 0.1642f, + 0.0874f, + 0.0661f}, + {0.1005f, + 0.1227f, + 0.0798f, + 0.1156f, + 0.0680f, + 0.0823f, + 0.1133f, + 0.0694f, + 0.1056f, + 0.1426f}}}, + {{{0.0645f, + 0.0734f, + 0.1118f, + 0.0981f, + 0.1052f, + 0.0853f, + 0.1085f, + 0.1118f, + 0.0906f, + 0.1509f}, + {0.1743f, + 0.0901f, + 0.0716f, + 0.0688f, + 0.0947f, + 0.1047f, + 0.1228f, + 0.0745f, + 0.1317f, + 0.0667f}}, + {{0.1164f, + 0.0665f, + 0.1224f, + 0.1075f, + 0.0750f, + 0.1394f, + 0.0925f, + 0.0871f, + 0.1313f, + 0.0620f}, + {0.1551f, + 0.0877f, + 0.1172f, + 0.0810f, + 0.1283f, + 0.0802f, + 0.0697f, + 0.0656f, + 0.0733f, + 0.1418f}}}}}); std::shared_ptr<Node> myShiftMax = ShiftMax(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax -> getOperator()); - op->associateInput(0,input0); + auto op = std::static_pointer_cast<OperatorTensor>( + myShiftMax->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - - float* computedOutput = new float[output_shiftmax->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftmax->size(), cudaMemcpyDeviceToHost); - //test if forward result are as expected - for(int i = 0; i < output_shiftmax->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_shiftmax->getImpl()->rawPtr()) + i); + float *computedOutput = new float[output_shiftmax->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_shiftmax->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_shiftmax->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_shiftmax->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - //measure difference between softmax and shiftmax + // measure difference between softmax and shiftmax float sum = 0.0; - for(int i = 0; i < output_softmax->size(); i++){ - const float targetOutput = 
*(static_cast<float*>(output_softmax->getImpl()->rawPtr()) + i); + for (int i = 0; i < output_softmax->size(); i++) { + const float targetOutput = *( + static_cast<float *>(output_softmax->getImpl()->rawPtr()) + i); sum += fabs(computedOutput[i] - targetOutput); } sum = sum / output_softmax->size(); @@ -128,42 +244,54 @@ TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { delete[] computedOutput; } - } TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, - }, - }, - } - }); - + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + input0->setBackend("cuda"); std::shared_ptr<Node> myShiftMax = ShiftMax(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); myShiftMax->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, + 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -171,47 +299,67 @@ TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myShiftMax->backward()); - //expected output of shiftmax backward operator - std::shared_ptr<Tensor> expectedInputGradShiftMax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of shiftmax backward operator + std::shared_ptr<Tensor> expectedInputGradShiftMax = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.159378, 0.0249331, -0.0250217, 0.0262418, -0.0514701, -0.00459638, -0.0551896, -0.0739511}, - }, + {0.159378, + 0.0249331, + -0.0250217, + 0.0262418, + -0.0514701, + -0.00459638, + -0.0551896, + -0.0739511}, }, - } - }); + }, + }}); - //expected output of softmax backward operator (computed with PyTorch) - std::shared_ptr<Tensor> expectedInputGradSoftmax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of softmax backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradSoftmax = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.1672, 0.0198, -0.0236, 0.0241, -0.0535, -0.0042, -0.0547, -0.0752}, - }, + {0.1672, + 0.0198, + -0.0236, + 0.0241, + -0.0535, + -0.0042, + -0.0547, + -0.0752}, }, - } - }); - + }, + }}); float *computedGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); - //test if backward result are as expected - for(int i = 0; i < 
expectedInputGradShiftMax->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradShiftMax->getImpl()->rawPtr()) + i); + // test if backward result are as expected + for (int i = 0; i < expectedInputGradShiftMax->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradShiftMax->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } - //measure difference between softmax and shiftmax + // measure difference between softmax and shiftmax float sum = 0.0; - for(int i = 0; i < expectedInputGradSoftmax->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradSoftmax->getImpl()->rawPtr()) + i); - sum += fabs(computedGradCuda[i] - targetOutput); - } - sum = sum / expectedInputGradSoftmax->size(); - REQUIRE(sum < 4e-3); + for (int i = 0; i < expectedInputGradSoftmax->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradSoftmax->getImpl()->rawPtr()) + + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradSoftmax->size(); + REQUIRE(sum < 4e-3); delete[] computedGradCuda; } diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp index cb120a970c5310f80f8c62960c029a845937ba30..ed34394267165549421d2171f50fc93d0d5d4572 100644 --- a/unit_tests/Test_TensorImpl.cpp +++ b/unit_tests/Test_TensorImpl.cpp @@ -26,34 +26,34 @@ TEST_CASE("CUDA test") { const int N = 100; // Allocate host memory - float* a = new float[N](); - float* b = new float[N](); - float* out = new float[N](); + float *a = new float[N](); + float *b = new float[N](); + float *out = new float[N](); // Initialize host arrays - for(int i = 0; i < N; i++){ + for (int i = 0; i < N; i++) { a[i] = 1.0f; b[i] = 2.0f; } // Allocate device memory - float *d_a, *d_b, *d_out; - cudaMalloc(reinterpret_cast<void**>(&d_a), sizeof(float) * N); - cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(float) * N); - cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(float) * N); + float *d_a, *d_b, *d_out; + cudaMalloc(reinterpret_cast<void **>(&d_a), sizeof(float) * N); + cudaMalloc(reinterpret_cast<void **>(&d_b), sizeof(float) * N); + cudaMalloc(reinterpret_cast<void **>(&d_out), sizeof(float) * N); // Transfer data from host to device memory cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice); - // Executing kernel + // Executing kernel vector_add(d_out, d_a, d_b, N); - + // Transfer data back to host memory cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost); // Verification - for(int i = 0; i < N; i++){ + for (int i = 0; i < N; i++) { REQUIRE(fabs(out[i] - a[i] - b[i]) < 1e-6); } @@ -72,17 +72,7 @@ TEST_CASE("Tensor creation", "[Connector]") { SECTION("from const array") { Tensor x; x.setBackend("cuda"); - x = Array3D<int,2,2,2>{ - { - { - {1, 2}, - {3, 4} - }, - { - {5, 6}, - {7, 8} - } - }}; + x = Array3D<int, 2, 2, 2>{{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}}; REQUIRE(x.nbDims() == 3); REQUIRE(x.dims()[0] == 2); @@ -91,23 +81,16 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(x.size() == 8); std::array<int, 8> val; - cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&val[0], + x.getImpl()->rawPtr(), + 8 * sizeof(int), + cudaMemcpyDeviceToHost); REQUIRE(val[0] == 1); REQUIRE(val[7] == 8); } SECTION("from const array before backend") { - Tensor x = Array3D<int,2,2,2>{ - { - { - {1, 2}, - {3, 4} - }, - { - {5, 
6}, - {7, 8} - } - }}; + Tensor x = Array3D<int, 2, 2, 2>{{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}}; x.setBackend("cuda"); REQUIRE(x.nbDims() == 3); @@ -117,7 +100,10 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(x.size() == 8); std::array<int, 8> val; - cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&val[0], + x.getImpl()->rawPtr(), + 8 * sizeof(int), + cudaMemcpyDeviceToHost); REQUIRE(val[0] == 1); REQUIRE(val[7] == 8); } @@ -127,37 +113,52 @@ TEST_CASE("Tensor Descriptor Update") { Tensor x; x.setBackend("cuda"); - std::vector<std::size_t> shapeA = { 7, 6, 5, 4, 3 }; + std::vector<std::size_t> shapeA = {7, 6, 5, 4, 3}; x.resize(shapeA); - cudnnTensorDescriptor_t desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); + cudnnTensorDescriptor_t desc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl()) + ->getCudnnTensorDesc(x); cudnnDataType_t currentDataType; int currentNbDims; std::vector<int> currentDimA(shapeA.size()); std::vector<int> currentStrideA(shapeA.size()); - REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeA.size(), ¤tDataType, ¤tNbDims, currentDimA.data(), currentStrideA.data())); + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, + shapeA.size(), + ¤tDataType, + ¤tNbDims, + currentDimA.data(), + currentStrideA.data())); - REQUIRE(std::equal(currentDimA.begin(), currentDimA.end(), shapeA.begin(), [](int a, std::size_t b) { - return static_cast<std::size_t>(a) == b; - } - ) - ); + REQUIRE(std::equal(currentDimA.begin(), + currentDimA.end(), + shapeA.begin(), + [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + })); // Change the tensor shape and check tensor descriptor - std::vector<std::size_t> shapeB = { 6, 5, 4 }; + std::vector<std::size_t> shapeB = {6, 5, 4}; x.resize(shapeB); std::vector<int> currentDimB(shapeB.size()); std::vector<int> currentStrideB(shapeB.size()); - desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); - REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeB.size(), ¤tDataType, ¤tNbDims, currentDimB.data(), currentStrideB.data())); - - REQUIRE(std::equal(currentDimB.begin(), currentDimB.end(), shapeB.begin(), [](int a, std::size_t b) { - return static_cast<std::size_t>(a) == b; - } - ) - ); + desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl()) + ->getCudnnTensorDesc(x); + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, + shapeB.size(), + ¤tDataType, + ¤tNbDims, + currentDimB.data(), + currentStrideB.data())); + + REQUIRE(std::equal(currentDimB.begin(), + currentDimB.end(), + shapeB.begin(), + [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + })); }