diff --git a/.gitignore b/.gitignore index f3571f3fefd133675c1989b50247a12d107bc685..047ffa35e8433bae8d5675830ca14f3d57c38a5e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # C++ Build build*/ install*/ +include/aidge/backend/cuda_version.h # VSCode .vscode diff --git a/CHANGELOG b/CHANGELOG index 48ecf3b7d3d1424ca26d60e320137292d296386a..70cde1b65556574127b55189857183cde3722b89 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,5 @@ +# Version 0.5.0 (January 31, 2025) + # Version 0.4.0 (December 6, 2024) # Version 0.1.0 (January 23, 2024) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a013f1bd78fa082570963510715154b6578e390..243cc75158a9c4b997a63e6a771bc4177eb26080 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,18 @@ # CMake >= 3.18 is required for good support of FindCUDAToolkit cmake_minimum_required(VERSION 3.18) -set(CXX_STANDARD 14) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version) +# Parse version.txt to retrieve Major, Minor and Patch +string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version}) +set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1}) +set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2}) +set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3}) + project(aidge_backend_cuda VERSION ${version} DESCRIPTION "CUDA implementations of the operators of aidge framework." @@ -21,7 +30,6 @@ execute_process( ) message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}") # Define a preprocessor macro with the Git commit version -add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}") # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME} set(module_name _${CMAKE_PROJECT_NAME}) # target name @@ -107,6 +115,14 @@ if (PYBIND) ) endif() +message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h") +# Generate version.h file from config file version.h.in +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in" + "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cuda_version.h" +) + + target_link_libraries(${module_name} PUBLIC _aidge_core # _ is added because we link the target not the project diff --git a/aidge_backend_cuda/__init__.py b/aidge_backend_cuda/__init__.py index c59afd66efc6197d93268e9e205f35048a26d60e..7b57dd6b1704b16e2b9869c975ea1feb625f5bcf 100644 --- a/aidge_backend_cuda/__init__.py +++ b/aidge_backend_cuda/__init__.py @@ -1,2 +1,2 @@ from aidge_backend_cuda.aidge_backend_cuda import * # import so generated by PyBind -from ._version import * + diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index 974dbb0248a77024b4c8bc7c7c467646512f0828..ec2c94090c8e8944008b94a49605e309d14d39ab 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -11,9 +11,13 @@ #ifndef AIDGE_BACKEND_CUDA_IMPORTS_H_ #define AIDGE_BACKEND_CUDA_IMPORTS_H_ +#include "aidge/backend/cuda_version.h" #include "aidge/backend/cuda/data/TensorImpl.hpp" + #include "aidge/backend/cuda/operator/OperatorImpl.hpp" + +#include "aidge/backend/cuda/operator/AbsImpl.hpp" #include "aidge/backend/cuda/operator/AddImpl.hpp" #include "aidge/backend/cuda/operator/AndImpl.hpp" #include "aidge/backend/cuda/operator/ArgMaxImpl.hpp" @@ -22,11 +26,14 @@ #include "aidge/backend/cuda/operator/ConvImpl.hpp" #include "aidge/backend/cuda/operator/ClipImpl.hpp" #include "aidge/backend/cuda/operator/DivImpl.hpp" +#include "aidge/backend/cuda/operator/ErfImpl.hpp" #include 
"aidge/backend/cuda/operator/FCImpl.hpp" #include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" #include "aidge/backend/cuda/operator/LRNImpl.hpp" #include "aidge/backend/cuda/operator/LnImpl.hpp" #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cuda/operator/MatMulImpl.hpp" #include "aidge/backend/cuda/operator/MulImpl.hpp" #include "aidge/backend/cuda/operator/PadImpl.hpp" #include "aidge/backend/cuda/operator/PowImpl.hpp" @@ -34,17 +41,12 @@ #include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/operator/ReLUImpl.hpp" #include "aidge/backend/cuda/operator/RoundImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" -#include "aidge/backend/cuda/operator/ReshapeImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/SigmoidImpl.hpp" +#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp" #include "aidge/backend/cuda/operator/SqrtImpl.hpp" #include "aidge/backend/cuda/operator/SubImpl.hpp" #include "aidge/backend/cuda/operator/TanhImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" -#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" - - #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 5a2873cfbfa9aafc97b543f2ccce3d1abf0b6057..67f6175efd7bcc77ec26da2c657982bf0229e038 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -4,6 +4,9 @@ #include <cstddef> // std::size_t #include <memory> #include <string> +#include <vector> + +#include <cuda.h> #include "aidge/backend/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" diff --git a/include/aidge/backend/cuda/operator/AbsImpl.hpp b/include/aidge/backend/cuda/operator/AbsImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5a99c613113396cc7b6fa7985835b3e7bcc8e1ed --- /dev/null +++ b/include/aidge/backend/cuda/operator/AbsImpl.hpp @@ -0,0 +1,63 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Abs.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class AbsImpl_cuda : public OperatorImpl { +public: + AbsImpl_cuda(const Abs_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<AbsImpl_cuda> create(const Abs_Op& op) { + return std::make_unique<AbsImpl_cuda>(op); + } + + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& input, const Tensor& outputGrad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Abs_Op, "cuda", Aidge::AbsImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ABSIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f385512e4d4c2494e62bcef23b879cb8b9e09ba8 --- /dev/null +++ b/include/aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp @@ -0,0 +1,36 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void absForward(const T* input, T* output, int size); + +} +#endif /* AIDGE_CUDA_OPERATOR_ABSIMPL_KERNELS_H_ */ + + + + + diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/Cublas_kernels.hpp similarity index 67% rename from include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp rename to include/aidge/backend/cuda/operator/Cublas_kernels.hpp index a956960df0a4dccb4ef9eb0634e5f61b9ddede0a..3287e5940fedd23ebd32b5c1c9e4517045c7bb96 100644 --- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/Cublas_kernels.hpp @@ -9,8 +9,8 @@ * ********************************************************************************/ -#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ -#define AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ +#ifndef AIDGE_CUDA_OPERATOR_CUBLAS_KERNELS_H_ +#define AIDGE_CUDA_OPERATOR_CUBLAS_KERNELS_H_ #include <stdexcept> #include <cfloat> @@ -33,6 +33,17 @@ cublasStatus_t cublasGemm(cublasHandle_t handle, const T *beta, T *C, int ldc); +template <class T> +cublasStatus_t cublasGemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const T *alpha, + const T *A, int lda, long long int strideA, + const T *B, int ldb, long long int strideB, + const T *beta, + T *C, int ldc, long long int strideC, + int batchCount); + template <class T> cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n, @@ -42,4 +53,4 @@ cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, const T *beta, T *y, int incy); } -#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ */ \ No newline at end of file +#endif /* AIDGE_CUDA_OPERATOR_CUBLAS_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ErfImpl.hpp b/include/aidge/backend/cuda/operator/ErfImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a17520925c8195db1fc9b4381d9984d40c4164f8 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ErfImpl.hpp @@ -0,0 +1,62 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ERFIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_ERFIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/Erf.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class ErfImpl_cuda : public OperatorImpl { +public: + ErfImpl_cuda(const Erf_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<ErfImpl_cuda> create(const Erf_Op& op) { + return std::make_unique<ErfImpl_cuda>(op); + } + + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInputFallback; + std::shared_ptr<Tensor> mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(Erf_Op, "cuda", Aidge::ErfImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ERFIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ErfImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ErfImpl_CUDA_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5a51f006e8b2b43d748dcb0d0e79f835b6616df2 --- /dev/null +++ b/include/aidge/backend/cuda/operator/ErfImpl_CUDA_kernels.hpp @@ -0,0 +1,32 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CUDA_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_ +#define AIDGE_CUDA_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_ + +#include <stdexcept> +#include <cfloat> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cuda_fp16.h> + +#include "aidge/data/Data.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <class T> +void ErfForward(const T* input, T* output, int size); + +} +#endif /* AIDGE_CUDA_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_ */ + diff --git a/include/aidge/backend/cuda/operator/MatMulImpl.hpp b/include/aidge/backend/cuda/operator/MatMulImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e5de57286ffc221a5f8b2bb84fa506d9a52d175e --- /dev/null +++ b/include/aidge/backend/cuda/operator/MatMulImpl.hpp @@ -0,0 +1,62 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MATMULIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_MATMULIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include <cudnn.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/operator/MatMul.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +#include "aidge/backend/cuda/utils/CudaUtils.hpp" + +namespace Aidge { +// Operator implementation entry point for the backend +class MatMulImpl_cuda : public OperatorImpl { +public: + MatMulImpl_cuda(const MatMul_Op& op) : OperatorImpl(op, "cuda") {} + + static std::unique_ptr<MatMulImpl_cuda> create(const MatMul_Op& op) { + return std::make_unique<MatMulImpl_cuda>(op); + } + + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + return { + {DataType::Float64}, + {DataType::Float32}, + {DataType::Float16}, + }; + } + + void forward() override; + void backward() override; + +private: + std::shared_ptr<Tensor> mInput0Fallback; + std::shared_ptr<Tensor> mInput1Fallback; + + template <class T> void forward_(const Tensor& input0, const Tensor& input1); + template <class T> void backward_(const Tensor& outGrad); +}; + +// Implementation entry point registration to Operator +REGISTRAR(MatMul_Op, "cuda", Aidge::MatMulImpl_cuda::create); +} // namespace Aidge + +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MATMULIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/SoftmaxImpl.hpp similarity index 62% rename from include/aidge/backend/cuda/operator/ReshapeImpl.hpp rename to include/aidge/backend/cuda/operator/SoftmaxImpl.hpp index 2c8ebd68cff0313031279f83109043eb17d919b5..cd4b8c6a58fb50419dbd8982632d488317e677cf 100644 --- a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp +++ b/include/aidge/backend/cuda/operator/SoftmaxImpl.hpp @@ -1,5 +1,5 @@ /******************************************************************************** - * Copyright (c) 2023 CEA-List + * Copyright (c) 2024 CEA-List * * This program and the accompanying materials are made available under the * terms of the Eclipse Public License 2.0 which is available at @@ -9,8 +9,8 @@ * ********************************************************************************/ -#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ -#define AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ +#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SOFTMAXIMPL_H_ +#define AIDGE_BACKEND_CUDA_OPERATOR_SOFTMAXIMPL_H_ #include <array> #include <memory> @@ -20,7 +20,7 @@ #include <cudnn.h> #include "aidge/backend/OperatorImpl.hpp" -#include "aidge/operator/Reshape.hpp" +#include "aidge/operator/Softmax.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" @@ -28,12 +28,12 @@ namespace Aidge { // Operator implementation entry point for the backend -class ReshapeImpl_cuda : public OperatorImpl { +class SoftmaxImpl_cuda : public OperatorImpl { public: - ReshapeImpl_cuda(const Reshape_Op& op) : OperatorImpl(op, "cuda") {} + SoftmaxImpl_cuda(const Softmax_Op& op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op& op) { - return std::make_unique<ReshapeImpl_cuda>(op); + static std::unique_ptr<SoftmaxImpl_cuda> create(const Softmax_Op& op) { + return std::make_unique<SoftmaxImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { @@ -48,11 +48,15 
@@ public: void backward() override; private: + // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; + + template <class T> void forward_(const Tensor& input); + template <class T> void backward_(const Tensor& output_grad, int axis); }; // Implementation entry point registration to Operator -REGISTRAR(Reshape_Op, "cuda", Aidge::ReshapeImpl_cuda::create); +REGISTRAR(Softmax_Op, "cuda", Aidge::SoftmaxImpl_cuda::create); } // namespace Aidge -#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */ +#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SOFTMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/utils/CudaUtils.hpp b/include/aidge/backend/cuda/utils/CudaUtils.hpp index ab7c805224ed6fe073baf2036b84f4ed6f49b077..5796227fbc4c012e8558ec933f206cd21c89b1d1 100644 --- a/include/aidge/backend/cuda/utils/CudaUtils.hpp +++ b/include/aidge/backend/cuda/utils/CudaUtils.hpp @@ -1,11 +1,11 @@ -#ifndef AIDGE_BACKEND_CUDA_CUDA_UTILS_H -#define AIDGE_BACKEND_CUDA_CUDA_UTILS_H +#ifndef AIDGE_BACKEND_CUDA_CUDA_UTILS_H_ +#define AIDGE_BACKEND_CUDA_CUDA_UTILS_H_ #include <string> #include <memory> -#include <sstream> -#include <iostream> #include <stdexcept> +#include <fmt/core.h> +#include <fmt/format.h> #include <cublas_v2.h> #include <cuda.h> @@ -18,31 +18,29 @@ do { \ const cudnnStatus_t e = (status); \ if (e != CUDNN_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "CUDNN failure: " << cudnnGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ + std::string error = fmt::format("CUDNN failure: {} ({}) in {}:{}", \ + cudnnGetErrorString(e), static_cast<int>(e), __FILE__, __LINE__); \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error = fmt::format("{} on device #{}", error, status_dev); \ + fmt::print(stderr, "{}\n", error); \ cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ + throw std::runtime_error(error); \ } \ } while(0) #define CHECK_CUDA_STATUS(status) \ do { \ const cudaError_t e = (status); \ - if ((e) != cudaSuccess) { \ - std::stringstream error; \ - error << "Cuda failure: " << cudaGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ + if ((e) != cudaSuccess) { \ + std::string error = fmt::format("Cuda failure: {} ({}) in {}:{}", \ + cudaGetErrorString(e), static_cast<int>(e), __FILE__, __LINE__); \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error = fmt::format("{} on device #{}", error, status_dev); \ + fmt::print(stderr, "{}\n", error); \ cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ + throw std::runtime_error(error); \ } \ } while(0) @@ -50,16 +48,14 @@ do { \ const cublasStatus_t e = (status); \ if (e != CUBLAS_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "Cublas failure: " \ - << Aidge::Cuda::cublasGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ + std::string error = fmt::format("Cublas failure: {} ({}) in {}:{}", \ + 
Aidge::Cuda::cublasGetErrorString(e), static_cast<int>(e), __FILE__, __LINE__); \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error = fmt::format("{} on device #{}", error, status_dev); \ + fmt::print(stderr, "{}\n", error); \ cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ + throw std::runtime_error(error); \ } \ } while(0) @@ -96,4 +92,4 @@ namespace Cuda { } } -#endif // AIDGE_BACKEND_CUDA_CUDA_UTILS_H +#endif // AIDGE_BACKEND_CUDA_CUDA_UTILS_H_ diff --git a/include/aidge/backend/version.h.in b/include/aidge/backend/version.h.in new file mode 100644 index 0000000000000000000000000000000000000000..4b876f63002972c1f8f1340b70cdecdace911012 --- /dev/null +++ b/include/aidge/backend/version.h.in @@ -0,0 +1,11 @@ +#ifndef VERSION_H +#define VERSION_H + +namespace Aidge { +static constexpr const int PROJECT_VERSION_MAJOR = @PROJECT_VERSION_MAJOR@; +static constexpr const int PROJECT_VERSION_MINOR = @PROJECT_VERSION_MINOR@; +static constexpr const int PROJECT_VERSION_PATCH = @PROJECT_VERSION_PATCH@; +static constexpr const char * PROJECT_VERSION = "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@"; +static constexpr const char * PROJECT_GIT_HASH = "@GIT_COMMIT_HASH@"; +} +#endif // VERSION_H diff --git a/include/aidge/utils/sys_info/CudaVersionInfo.hpp b/include/aidge/utils/sys_info/CudaVersionInfo.hpp index 17490476b18d62da66671a28f76709349e3ba805..dfa5fc82e0abcc2beea9080578b3a0c3259cb58d 100644 --- a/include/aidge/utils/sys_info/CudaVersionInfo.hpp +++ b/include/aidge/utils/sys_info/CudaVersionInfo.hpp @@ -3,20 +3,19 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDA_STATUS #include "aidge/utils/Log.hpp" - +#include "aidge/backend/cuda_version.h" namespace Aidge { -#ifndef PROJECT_VERSION // Normally defined in CMakeLists.txt -#define PROJECT_VERSION "Unknown version" -#endif -#ifndef GIT_COMMIT_HASH -#define GIT_COMMIT_HASH "" -#endif -#ifndef CUDA_COMPILER_VERSION -#define CUDA_COMPILER_VERSION "Unknown version" -#endif -void showCudaVersion() { - Log::info("Aidge backend CUDA: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__); +constexpr inline const char * getBackendCudaProjectVersion(){ + return PROJECT_VERSION; +} + +constexpr inline const char * getBackendCudaGitHash(){ + return PROJECT_GIT_HASH; +} + +void showBackendCudaProjectVersion() { + Log::info("Aidge backend CUDA: {} ({}), {} {}", getBackendCudaProjectVersion(), getBackendCudaGitHash(), __DATE__, __TIME__); Log::info("CUDA compiler version: {}", CUDA_COMPILER_VERSION); Log::info("CuDNN version: {}.{}.{}\n", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL); diff --git a/project_name.txt b/project_name.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dcb1176a91a8786f918c2ea3a7e95b5e3d1edef --- /dev/null +++ b/project_name.txt @@ -0,0 +1 @@ +aidge_backend_cuda diff --git a/pyproject.toml b/pyproject.toml index 9816f05666ac6231387717d5267f97a1764140cf..9abb470b62d89ec89e2712c6721428dec87307b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,22 @@ description="CUDA implementations of the operators of aidge framework" dependencies = [ "numpy", ] -requires-python = ">= 3.7" +requires-python = ">= 3.8" readme = "README.md" license = { file = "LICENSE" } classifiers = [ "Development Status :: 2 - Pre-Alpha", "Programming Language :: Python :: 3" ] -dynamic = ["version"] +dynamic = ["version"] # defined by pbr + + +[project.urls] +Homepage = "https://www.deepgreen.ai/en/platform" +Documentation = 
"https://eclipse-aidge.readthedocs.io/en/latest/" +Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cuda" +Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cuda/-/issues" +Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cuda/-/releases" ##################################################### # SETUPTOOLS @@ -21,16 +29,13 @@ where = ["."] # list of folders that contain the packages (["."] by default) include = ["aidge_backend_cuda*"] # package names should match these glob patterns (["*"] by default) exclude = ["aidge_backend_cuda.unit_tests*"] # exclude packages matching these glob patterns (empty by default) namespaces = false # to disable scanning PEP 420 namespaces (true by default) -# SETUPTOOLS_SCM -[tool.setuptools_scm] -write_to = "aidge_backend_cuda/_version.py" [build-system] requires = [ "setuptools>=68", - "setuptools-scm", "cmake>=3.18.0", - "toml" + "toml", + "pbr" ] build-backend = "setuptools.build_meta" diff --git a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp index 64f650903ec75d579ffd58dbd6d7db7bbaf573a2..b3823d52f8dfa25eb8ce46543a6fa67f548659ec 100644 --- a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp +++ b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp @@ -4,6 +4,8 @@ namespace py = pybind11; namespace Aidge { void init_cuda_sys_info(py::module& m){ - m.def("show_cuda_version", &showCudaVersion); + m.def("show_version", &showBackendCudaProjectVersion); + m.def("get_project_version", &getBackendCudaProjectVersion); + m.def("get_git_hash", &getBackendCudaGitHash); } } diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..aa0f227f6688468a5ab93384f7b1670086000035 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +# pbr file +[metadata] +version = file: version.txt diff --git a/setup.py b/setup.py index 706fc53ca08319ee487ef789ebc85f0d513ab25b..c72553a4650c7671669fa144ab3baacad77373fd 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ class CMakeBuild(build_ext): # This lists the number of processors available on the machine # The compilation will use half of them max_jobs = str(ceil(multiprocessing.cpu_count() / 2)) + max_jobs = os.environ.get("AIDGE_NB_PROC", max_jobs) cwd = pathlib.Path().absolute() @@ -50,33 +51,41 @@ class CMakeBuild(build_ext): os.chdir(str(build_temp)) - compile_type = ( - "Release" - if "AIDGE_PYTHON_BUILD_TYPE" not in os.environ - else os.environ["AIDGE_PYTHON_BUILD_TYPE"] - ) - install_path = ( os.path.join(sys.prefix, "lib", "libAidge") if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"] ) + # Read environment variables for CMake options + c_compiler = os.environ.get("AIDGE_C_COMPILER", "gcc") + cxx_compiler = os.environ.get("AIDGE_CXX_COMPILER", "g++") + build_type = os.environ.get("AIDGE_BUILD_TYPE", "Release") + asan = os.environ.get("AIDGE_ASAN", "OFF") + with_cuda = os.environ.get("AIDGE_WITH_CUDA", "OFF") + # using ninja as default build system to build faster and with the same compiler as on windows - build_gen = ( - ["-G", os.environ["AIDGE_BUILD_GEN"]] - if "AIDGE_BUILD_GEN" in os.environ + build_gen = os.environ.get("AIDGE_BUILD_GEN", "") + build_gen_opts = ( + ["-G", build_gen] + if build_gen else [] ) - + + test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF") + self.spawn( [ "cmake", *build_gen, str(cwd), - "-DTEST=OFF", + f"-DTEST={test_onoff}", f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}", - 
f"-DCMAKE_BUILD_TYPE={compile_type}", + f"-DCMAKE_BUILD_TYPE={build_type}", + f"-DCMAKE_C_COMPILER={c_compiler}", + f"-DCMAKE_CXX_COMPILER={cxx_compiler}", + f"-DENABLE_ASAN={asan}", + f"-DCUDA={with_cuda}", "-DPYBIND=ON", "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", "-DCOVERAGE=OFF", @@ -86,9 +95,9 @@ class CMakeBuild(build_ext): if not self.dry_run: self.spawn( - ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs] + ["cmake", "--build", ".", "--config", build_type, "-j", max_jobs] ) - self.spawn(["cmake", "--install", ".", "--config", compile_type]) + self.spawn(["cmake", "--install", ".", "--config", build_type]) os.chdir(str(cwd)) aidge_package = build_lib / (get_project_name()) diff --git a/src/operator/AbsImpl.cpp b/src/operator/AbsImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57fc721cc970ef5f2ca99b8b73a417dae80ca15b --- /dev/null +++ b/src/operator/AbsImpl.cpp @@ -0,0 +1,70 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include <cuda_fp16.h> +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AbsImpl.hpp" +#include "aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Abs.hpp" +#include "aidge/utils/Types.h" + +void Aidge::AbsImpl_cuda::forward() { + + const Abs_Op& op = static_cast<const Abs_Op&>(mOp); + + AIDGE_ASSERT(op.getInput(0), "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by the CUDA backend"); + } +} + +template <class T> +void Aidge::AbsImpl_cuda::forward_(const Tensor& input) +{ + const Abs_Op& op = static_cast<const Abs_Op&>(mOp); + + const T* inputPtr = static_cast<T*>(input.getImpl()->rawPtr()); + T* outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + int size = op.getInput(0)->size(); + + Aidge::absForward<T>(inputPtr, outputPtr, size); +} + +void Aidge::AbsImpl_cuda::backward() { + // TODO +} + +template <class T> +void Aidge::AbsImpl_cuda::backward_(const Tensor& input, const Tensor& outputGrad) { + // TODO +} \ No newline at end of file diff --git a/src/operator/AbsImpl_CUDA_kernels.cu b/src/operator/AbsImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca9e37ef074d4ddc1f523118297521cdce7a4e64 --- /dev/null +++ b/src/operator/AbsImpl_CUDA_kernels.cu @@ -0,0 +1,47 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse 
Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/AbsImpl_CUDA_kernels.hpp" + +// Base template for floating-point types (float, double) +template<typename T> +__device__ T abs_helper(T x) { + return std::abs(x); // std::abs works for both float and double +} + +// Specialization for half-precision type using CUDA's half +template<> +__device__ half abs_helper<half>(half x) { + float x_float = __half2float(x); + return __float2half(std::abs(x_float)); +} + +template <class T> +__global__ void absKernel(const T* input, T* output, int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + output[idx] = abs_helper(input[idx]); +} + +template <class T> +void Aidge::absForward(const T* input, T* output, int size) +{ + int blockSize = 256; + int numBlocks = (size + blockSize - 1) / blockSize; + absKernel<<<numBlocks, blockSize>>>(input, output, size); +}; + +template void Aidge::absForward<double>(const double* input, double* output, int size); + +template void Aidge::absForward<float>(const float* input, float* output, int size); + +template void Aidge::absForward<half>(const half* input, half* output, int size); \ No newline at end of file diff --git a/src/operator/FCImpl_CUDA_kernels.cu b/src/operator/Cublas_kernels.cu similarity index 62% rename from src/operator/FCImpl_CUDA_kernels.cu rename to src/operator/Cublas_kernels.cu index 90b7c6bbc9cf0f470a7184e24273aae04da6f3c6..5cc943f522392c1cbd3d8c76fb1739260b080e18 100644 --- a/src/operator/FCImpl_CUDA_kernels.cu +++ b/src/operator/Cublas_kernels.cu @@ -10,7 +10,7 @@ ********************************************************************************/ #include <stdio.h> -#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/Cublas_kernels.hpp" namespace Aidge{ @@ -74,6 +74,73 @@ cublasStatus_t cublasGemm<double>(cublasHandle_t handle, C, ldc); } + +template <> +cublasStatus_t cublasGemmStridedBatched<__half>(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const __half *alpha, + const __half *A, int lda, long long int strideA, + const __half *B, int ldb, long long int strideB, + const __half *beta, + __half *C, int ldc, long long int strideC, + int batchCount) +{ + return cublasHgemmStridedBatched(handle, + transa, transb, + m, n, k, + alpha, + A, lda, strideA, + B, ldb, strideB, + beta, + C, ldc, strideC, + batchCount); +} + +template <> +cublasStatus_t cublasGemmStridedBatched<float>(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, long long int strideA, + const float *B, int ldb, long long int strideB, + const float *beta, + float *C, int ldc, long long int strideC, + int batchCount) +{ + return cublasSgemmStridedBatched(handle, + transa, transb, + m, n, k, + alpha, + A, lda, strideA, + B, ldb, strideB, + beta, + C, ldc, strideC, + batchCount); +} + +template <> +cublasStatus_t cublasGemmStridedBatched<double>(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *A, int lda, long long int strideA, + const double *B, int ldb, long long int strideB, + const double *beta, + double *C, int ldc, long long int strideC, + int batchCount) 
+{ + return cublasDgemmStridedBatched(handle, + transa, transb, + m, n, k, + alpha, + A, lda, strideA, + B, ldb, strideB, + beta, + C, ldc, strideC, + batchCount); +} + template <> cublasStatus_t cublasGemv<__half>(cublasHandle_t handle, cublasOperation_t trans, int m, int n, diff --git a/src/operator/ErfImpl.cpp b/src/operator/ErfImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..faa8067ce62ab03b420dd27975ddcead0332bb20 --- /dev/null +++ b/src/operator/ErfImpl.cpp @@ -0,0 +1,79 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ErfImpl.hpp" +#include "aidge/backend/cuda/operator/ErfImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Erf.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ErfImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + AIDGE_ASSERT(op.getInput(0), "missing input #0"); + + const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::ErfImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + Aidge::ErfForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); +} + +void Aidge::ErfImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output #0 grad"); + + const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + + switch(op.getInput(0)->grad()->dataType()) { + case DataType::Float64: + backward_<double>(output_grad); + break; + case DataType::Float32: + backward_<float>(output_grad); + break; + case DataType::Float16: + backward_<half>(output_grad); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::ErfImpl_cuda::backward_(const Tensor& output_grad) { + //TODO +} \ No newline at end of file diff --git a/src/operator/ErfImpl_CUDA_kernels.cu b/src/operator/ErfImpl_CUDA_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..012c02fd32a2c2fd83892b8e5514b76878f1b266 --- /dev/null +++ b/src/operator/ErfImpl_CUDA_kernels.cu @@ -0,0 +1,48 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the 
accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cuda/operator/ErfImpl_CUDA_kernels.hpp" +// Base template for floating-point types (float, double) +template<typename T> +__device__ T Erf_helper(T x) { + return erff(x); +} + +// Specialization for half-precision type using CUDA's half +template<> +__device__ half Erf_helper<half>(half x) { + float x_float = __half2float(x); // Convert __half to float + return __float2half(erff(x_float)); // Compute erf and convert back to half +} + +template <class T> +__global__ void ErfKernel(const T* input, T* output, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + + output[idx] = Erf_helper(input[idx]); +} + +template <class T> +void Aidge::ErfForward(const T* input, T* output, int size) +{ + int blockSize = 256; + int numBlocks = (size + blockSize - 1) / blockSize; + + // Launch the kernel + ErfKernel<<<numBlocks, blockSize>>>(input, output, size); +}; + +template void Aidge::ErfForward<double>(const double* input, double* output, int size); + +template void Aidge::ErfForward<float>(const float* input, float* output, int size); + +template void Aidge::ErfForward<half>(const half* input, half* output, int size); diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 55cb31b7492956a2c722e775c225276d22fbdf4e..de89df4d941771f305d359d9d2f5e7036eee285b 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -17,7 +17,7 @@ #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/FCImpl.hpp" -#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/Cublas_kernels.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/FC.hpp" diff --git a/src/operator/ILayerNormImpl_CUDA_kernels.cu b/src/operator/ILayerNormImpl_CUDA_kernels.cu index fafdc176fdad6a6130c9bc4374d75f8a773f2c16..72a1371fde04960230dfb0d2010562f69ee43bb6 100644 --- a/src/operator/ILayerNormImpl_CUDA_kernels.cu +++ b/src/operator/ILayerNormImpl_CUDA_kernels.cu @@ -11,13 +11,19 @@ * ********************************************************************************/ -#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) -#define CLAMP(X) (((X) < (0)) ? (0) : (X)) +#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp" -#include <stdio.h> +#include <algorithm> // std::min +#include <cmath> // std::sqrt +#include <cstddef> // std::size_t +#include <vector> + +#include <cuda.h> #include <cuda_runtime.h> +#include <fmt/core.h> -#include "aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp" +#define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) +#define CLAMP(X) (((X) < (0)) ? 
(0) : (X)) namespace Aidge{ @@ -67,10 +73,10 @@ __global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantize } template <> -void ILayerNormforward<float>(const float* input, float* output, double SF, const float* weight_raw, const float* bias_raw, size_t size, std::vector<long unsigned int> dims_input) +void ILayerNormforward<float>(const float* input, float* output, double SF, const float* weight_raw, const float* bias_raw, std::size_t size, std::vector<long unsigned int> dims_input) { int dims_input_cuda[4] = {1, 1, 1, 1}; - for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + for (std::size_t i = 0; i < std::min(dims_input.size(), std::size_t(4)); ++i) { dims_input_cuda[i] = static_cast<int>(dims_input[i]); } @@ -109,7 +115,7 @@ void ILayerNormforward<float>(const float* input, float* output, double SF, cons cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + fmt::print(stderr, "CUDA Error: {}", cudaGetErrorString(err)); } cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(float), cudaMemcpyDeviceToHost); @@ -122,10 +128,10 @@ void ILayerNormforward<float>(const float* input, float* output, double SF, cons } template <> -void ILayerNormforward<double>(const double* input, double* output, double SF, const double* weight_raw, const double* bias_raw, size_t size, std::vector<long unsigned int> dims_input) +void ILayerNormforward<double>(const double* input, double* output, double SF, const double* weight_raw, const double* bias_raw, std::size_t size, std::vector<long unsigned int> dims_input) { int dims_input_cuda[4] = {1, 1, 1, 1}; - for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { + for (std::size_t i = 0; i < std::min(dims_input.size(), std::size_t(4)); ++i) { dims_input_cuda[i] = static_cast<int>(dims_input[i]); } @@ -164,7 +170,7 @@ void ILayerNormforward<double>(const double* input, double* output, double SF, c cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + fmt::print(stderr, "CUDA Error: {}", cudaGetErrorString(err)); } cudaMemcpy(output,input_cuda_tensor, (size ) * sizeof(double), cudaMemcpyDeviceToHost); @@ -188,12 +194,12 @@ __global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_t input_grad[i] = d_input; weight_grad[i] = output_grad[i] * output_tensor[i]; - bias_grad[i] = output_grad[i]; + bias_grad[i] = output_grad[i]; } } template <> -void ILayerNormbackward<float>(const float* input_tensor, const float* output_grad, const float* output_tensor,const float* mean,const float* var, const float* weight, const float* bias, float* input_grad, float* weight_grad, float* bias_grad, size_t size) +void ILayerNormbackward<float>(const float* input_tensor, const float* output_grad, const float* output_tensor,const float* mean,const float* var, const float* weight, const float* bias, float* input_grad, float* weight_grad, float* bias_grad, std::size_t size) { float* input_cuda_tensor; cudaMalloc(&input_cuda_tensor,size*sizeof(float)); @@ -223,7 +229,7 @@ void ILayerNormbackward<float>(const float* input_tensor, const float* output_gr cudaMalloc(&bias_,size*sizeof(float)); cudaMemcpy(bias_,bias,size*sizeof(float),cudaMemcpyHostToDevice); - + float* input_grad_; cudaMalloc(&input_grad_,size*sizeof(float)); @@ -243,7 +249,7 @@ void ILayerNormbackward<float>(const float* input_tensor, const float* output_gr cudaError_t err = 
cudaGetLastError(); if(err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + fmt::print(stderr, "CUDA Error: {}", cudaGetErrorString(err)); } cudaMemcpy(input_grad , input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); @@ -259,11 +265,11 @@ void ILayerNormbackward<float>(const float* input_tensor, const float* output_gr cudaFree(input_grad_); cudaFree(weight_grad_); cudaFree(bias_grad_); - + } template <> -void ILayerNormbackward<double>(const double* input_tensor, const double* output_grad, const double* output_tensor,const double* mean,const double* var, const double* weight, const double* bias, double* input_grad, double* weight_grad, double* bias_grad, size_t size) +void ILayerNormbackward<double>(const double* input_tensor, const double* output_grad, const double* output_tensor,const double* mean,const double* var, const double* weight, const double* bias, double* input_grad, double* weight_grad, double* bias_grad, std::size_t size) { double* input_cuda_tensor; cudaMalloc(&input_cuda_tensor,size*sizeof(double)); @@ -293,7 +299,7 @@ void ILayerNormbackward<double>(const double* input_tensor, const double* output cudaMalloc(&bias_,size*sizeof(double)); cudaMemcpy(bias_,bias,size*sizeof(double),cudaMemcpyHostToDevice); - + double* input_grad_; cudaMalloc(&input_grad_,size*sizeof(double)); @@ -313,7 +319,8 @@ void ILayerNormbackward<double>(const double* input_tensor, const double* output cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + fmt::print(stderr, "CUDA Error: {}", cudaGetErrorString(err)); + } diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cfabb529c61d8d593daa5f50658b158e278d6de --- /dev/null +++ b/src/operator/MatMulImpl.cpp @@ -0,0 +1,123 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/MatMulImpl.hpp" +#include "aidge/backend/cuda/operator/Cublas_kernels.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/MatMul.hpp" +#include "aidge/utils/Types.h" + +void Aidge::MatMulImpl_cuda::forward() { + const MatMul_Op& op = static_cast<const MatMul_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in MatMul operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run MatMul forward because the 0-th input has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in MatMul operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run MatMul forward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot MatMul inputs with two different data types."); + } + + + const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1); + break; + case DataType::Float32: + forward_<float>(input0, input1); + break; + case DataType::Float16: + forward_<half>(input0, input1); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::MatMulImpl_cuda::forward_(const Tensor& input0, const Tensor& input1) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const T * input0Ptr = static_cast<const T*>(input0.getImpl()->rawPtr()); + const T * input1Ptr = static_cast<const T*>(input1.getImpl()->rawPtr()); + T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + AIDGE_ASSERT(input0.nbDims()>=2 , "input#0 must be at least 2D"); + + if (input1.nbDims()>=2) { + int k = input0.dims()[input0.nbDims()-1]; // Number of columns of input0, rows of input1 + int m = input0.dims()[input0.nbDims()-2]; // Number of rows of input0 and output + int n = input1.dims()[input1.nbDims()-1]; // Number of columns of input1 and output + + int batchCount = op.getOutput(0)->size() / ( m * n ); + int batchCount0 = input0.size() / ( m * k ); + int batchCount1 = input1.size() / ( k * n ); + + AIDGE_ASSERT((batchCount0==1 || batchCount0==batchCount)&& + (batchCount1==1 || batchCount1==batchCount), + "Broadcasting case not yet supported by Backend Cuda!"); + + long long int strideA = batchCount0 == 1 ? 0 : m * k; + long long int strideB = batchCount1 == 1 ? 
0 : k * n; + long long int strideC = m * n; + + CHECK_CUBLAS_STATUS(cublasGemmStridedBatched(CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + m, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + input1Ptr, n, strideB, + input0Ptr, k, strideA, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + outputPtr, n, strideC, + batchCount)); + } + else { + int k = input0.dims()[input0.nbDims()-1]; // Number of columns of input0 + int m = input0.size() / k; // Number of rows of input0 and output + + CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + 1, + m, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + input1Ptr, 1, + input0Ptr, k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + outputPtr, 1)); + } +} + +void Aidge::MatMulImpl_cuda::backward() { + // TODO +} + +template <class T> +void Aidge::MatMulImpl_cuda::backward_(const Tensor& outGrad) { + // TODO +} \ No newline at end of file diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp deleted file mode 100644 index 49f732e120fd5de9454b47828caaa5b2ce6f5c58..0000000000000000000000000000000000000000 --- a/src/operator/ReshapeImpl.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. - * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> - -#include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ReshapeImpl.hpp" -#include "aidge/backend/cuda/utils/CudaContext.hpp" -#include "aidge/operator/Reshape.hpp" -#include "aidge/utils/Types.h" - -void Aidge::ReshapeImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - // FIXME: uncomment the following code once memory handling will work - assert(mOp.getRawInput(0) && "missing input #0"); - - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))-> getImpl() -> setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); -} - -void Aidge::ReshapeImpl_cuda::backward() { - - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); - - std::static_pointer_cast<Tensor> (mOp.getRawInput(0))->grad()->getImpl()->setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); -} diff --git a/src/operator/ShiftMaxImpl_CUDA_kernels.cu b/src/operator/ShiftMaxImpl_CUDA_kernels.cu index ba3cfcb51e02fb0befbf9f7c1fc054e73a2a7157..169770c237cd80a8c3357dbd483251b480808b1a 100644 --- a/src/operator/ShiftMaxImpl_CUDA_kernels.cu +++ b/src/operator/ShiftMaxImpl_CUDA_kernels.cu @@ -13,11 +13,17 @@ #define MAX(X,Y) (((X) > (Y)) ? (X) : (Y)) #define CLAMP(X) (((X) < (0)) ? 
(0) : (X)) -#include <stdio.h> -#include <cuda_runtime.h> - #include "aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp" +#include <algorithm> // std::min +#include <math.h> +#include <cstddef> // std::size_t +#include <vector> + +#include <cuda.h> +#include <cuda_runtime.h> +#include <fmt/core.h> + __device__ inline int ExpShift(int I,int N, double SF) { int Ip = I + (I >> 1) - (I >> 4); @@ -79,7 +85,7 @@ __global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int template <> void ShiftMaxforward<float>(const float* input, float* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { - double new_SF = 1 / std::pow(2, 2 * output_bits - 1); // New scaling factor + double new_SF = 1 / powf(2, 2 * output_bits - 1); // New scaling factor int dims_input_cuda[4] = {1, 1, 1, 1}; for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { @@ -116,7 +122,7 @@ void ShiftMaxforward<float>(const float* input, float* output, double SF, int N, // Check for CUDA errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + fmt::print(stderr, "CUDA Error: {}\n", cudaGetErrorString(err)); } // Copy the result back to host @@ -132,7 +138,7 @@ void ShiftMaxforward<float>(const float* input, float* output, double SF, int N, template <> void ShiftMaxforward<double>(const double* input, double* output, double SF, int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input) { - double new_SF = 1 / std::pow(2, 2 * output_bits - 1); + double new_SF = 1 / powf(2, 2 * output_bits - 1); int dims_input_cuda[4] = {1, 1, 1, 1}; for (std::size_t i = 0; i < std::min(dims_input.size(), size_t(4)); ++i) { @@ -169,7 +175,7 @@ void ShiftMaxforward<double>(const double* input, double* output, double SF, int // Check for CUDA errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; + fmt::print(stderr, "CUDA Error: {}\n", cudaGetErrorString(err)); } // Copy the result back to host @@ -201,7 +207,7 @@ __global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T template <> void ShiftMaxbackward<float>(const float* output_tensor, const float* output_grad, float* input_grad, size_t size, std::vector<long unsigned int> dims) -{ +{ int dims_input_cuda[4] = {1, 1, 1, 1}; for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) { dims_input_cuda[i] = static_cast<int>(dims[i]); @@ -230,7 +236,7 @@ void ShiftMaxbackward<float>(const float* output_tensor, const float* output_gra cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + fmt::print(stderr, "CUDA Error: {}\n", cudaGetErrorString(err)); } cudaMemcpy(input_grad, input_grad_, (size) * sizeof(float), cudaMemcpyDeviceToHost); @@ -242,7 +248,7 @@ void ShiftMaxbackward<float>(const float* output_tensor, const float* output_gra template <> void ShiftMaxbackward<double>(const double* output_tensor, const double* output_grad, double* input_grad, size_t size, std::vector<long unsigned int> dims) -{ +{ int dims_input_cuda[4] = {1, 1, 1, 1}; for (std::size_t i = 0; i < std::min(dims.size(), size_t(4)); ++i) { dims_input_cuda[i] = static_cast<int>(dims[i]); @@ -271,7 +277,7 @@ void ShiftMaxbackward<double>(const double* output_tensor, const double* output_ cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { - 
printf("CUDA Error: %s\n", cudaGetErrorString(err)); + fmt::print(stderr, "CUDA Error: {}\n", cudaGetErrorString(err)); } cudaMemcpy(input_grad,input_grad_, (size) * sizeof(double), cudaMemcpyDeviceToHost); diff --git a/src/operator/SoftmaxImpl.cpp b/src/operator/SoftmaxImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8151b9ece7c33ad849b772a4385c7732931dd211 --- /dev/null +++ b/src/operator/SoftmaxImpl.cpp @@ -0,0 +1,111 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <vector> + +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/SoftmaxImpl.hpp" +#include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/operator/Softmax.hpp" +#include "aidge/utils/Types.h" + +void Aidge::SoftmaxImpl_cuda::forward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getInput(0), "Missing input in Softmax operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "Cannot run Softmax forward because the input has no implementation."); + + const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + + const Softmax_Op& softmaxOp = static_cast<const Softmax_Op&>(mOp); + auto axis = softmaxOp.axis(); + + AIDGE_ASSERT(axis == 1, "Softmax is only supported on Channels currently for Backend Cuda."); + + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda."); + } +} + + +template <class T> +void Aidge::SoftmaxImpl_cuda::forward_(const Tensor& input) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + + // Set the algorithm and mode for softmax + cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_ACCURATE; + cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_CHANNEL; + + AIDGE_ASSERT(input.nbDims() > 1 && input.nbDims() < 6, + "Tensor dimension {} is not yet supported, dimensions range supported for Softmax Backend Cuda is [2,5].", + input.nbDims()); + + // Call the softmax forward function + CHECK_CUDNN_STATUS(cudnnSoftmaxForward( + CudaContext::cudnnHandle(), + algo, + mode, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); +} + +void Aidge::SoftmaxImpl_cuda::backward() { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), "missing 
outputGrad in Softmax operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Softmax backward because the output grad has no implementation."); + + const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); + + const Softmax_Op& softmaxOp = static_cast<const Softmax_Op&>(mOp); + auto axis = softmaxOp.axis(); + switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axis); + break; + case DataType::Float32: + backward_<float>(outGrad, axis); + break; + case DataType::Float16: + backward_<half>(outGrad, axis); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + } +} + +template <class T> +void Aidge::SoftmaxImpl_cuda::backward_(const Tensor& outGrad, int axis) { + const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; + // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; + const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + //TODO +} diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index be41581aa5c3d5f0b0d5ba32b08ecdd0f6e4cd73..e2c102498dbf0e6f63e1bae5d9f9b3ad920f0930 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -1,17 +1,25 @@ -Include(FetchContent) - -set(CATCH2_VERSION v3.7.1) -message(STATUS "Retrieving Catch2 ${CATCH2_VERSION} from git") -FetchContent_Declare( - Catch2 - GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG ${CATCH2_VERSION} # or a later release -) - -FetchContent_MakeAvailable(Catch2) +# Catch2 configuration +set(CATCH2_MIN_VERSION 3.3.0) + +# Try to find system installed Catch2 +find_package(Catch2 ${CATCH2_MIN_VERSION} QUIET) + +if(NOT Catch2_FOUND) + message(STATUS "Catch2 not found in system, retrieving from git") + Include(FetchContent) + + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG devel # or a later release + ) + FetchContent_MakeAvailable(Catch2) + message(STATUS "Fetched Catch2 version ${Catch2_VERSION}") +else() + message(STATUS "Using system Catch2 version ${Catch2_VERSION}") +endif() file(GLOB_RECURSE src_files "*.cpp" "*.cu") - add_executable(tests${module_name} ${src_files}) target_link_libraries(tests${module_name} PUBLIC ${module_name}) @@ -22,7 +30,8 @@ target_link_libraries(tests${module_name} CUDA::cudart cudnn CUDA::cublas - + PUBLIC + ${module_name} ) list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) diff --git a/unit_tests/Test_AbsImpl.cpp b/unit_tests/Test_AbsImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9daea18d8cf147337d18d7622233821938f9fae6 --- /dev/null +++ b/unit_tests/Test_AbsImpl.cpp @@ -0,0 +1,129 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/AbsImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AbsImpl.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/Abs.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace std::chrono; +namespace Aidge { + +TEST_CASE("[gpu/operator] Abs", "[Abs][GPU]") +{ + // CONSTANTS + + constexpr std::uint16_t NB_TRIALS = 10; + + // SETUP RNGS + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(-1, 1); + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(20)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(6)); + + for (std::uint16_t trial = 0; trial < NB_TRIALS; ++trial) + { + // PREPARE TEST DATA + + const std::size_t nbDims = nbDimsDist(gen); + + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) + dims.push_back(dimSizeDist(gen)); + + const std::size_t nbElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* rawData = new float[nbElements]; + for (std::size_t i = 0; i < nbElements; ++i) + rawData[i] = valueDist(gen); + + // CPU FORWARD + + std::shared_ptr<Abs_Op> cpuOp = std::make_shared<Abs_Op>(); + cpuOp->setDataType(DataType::Float32); + cpuOp->setBackend("cpu"); + + std::shared_ptr<Tensor> cpuTensor = std::make_shared<Tensor>(); + cpuOp->associateInput(0, cpuTensor); + cpuTensor->setDataType(DataType::Float32); + cpuTensor->setBackend("cpu"); + cpuTensor->resize(dims); + cpuTensor->getImpl()->setRawPtr(rawData, nbElements); + + auto startTime = std::chrono::system_clock::now(); + cpuOp->forward(); + auto endTime = std::chrono::system_clock::now(); + auto cpuElapsedTime = duration_cast<milliseconds>(endTime - startTime).count(); + + Tensor cpuResult = *(cpuOp->getOutput(0)); + + // CUDA FORWARD + + std::shared_ptr<Abs_Op> cudaOp = std::make_shared<Abs_Op>(); + cudaOp->setDataType(DataType::Float32); + cudaOp->setBackend("cuda"); + + std::shared_ptr<Tensor> cudaTensor = std::make_shared<Tensor>(); + cudaTensor->setDataType(DataType::Float32); + cudaTensor->setBackend("cuda"); + cudaTensor->resize(dims); + cudaOp->associateInput(0, cudaTensor); + + float* rawDataDevice; + cudaMalloc(reinterpret_cast<void **> (&rawDataDevice), sizeof(float) * nbElements); + cudaMemcpy(rawDataDevice, rawData, sizeof(float) * nbElements, cudaMemcpyHostToDevice); + cudaTensor->getImpl()->setRawPtr(rawDataDevice, nbElements); + + startTime = std::chrono::system_clock::now(); + cudaOp->forward(); + endTime = std::chrono::system_clock::now(); + auto cudaElapsedTime = duration_cast<milliseconds>(endTime - startTime).count(); + + std::shared_ptr<Tensor> fallback; + Tensor& cudaResult = 
cudaOp->getOutput(0)->refCastFrom(fallback, DataType::Float32, "cpu"); + + // COMPARE + + REQUIRE(approxEq<float>(cudaResult, cpuResult)); + + // FREE MEMORY + + delete[] rawData; + cudaFree(rawDataDevice); + + // LOG INFOS + + fmt::print(" Execution time on CPU : {} ms\n", cpuElapsedTime); + fmt::print(" Execution time on CUDA : {} ms\n", cudaElapsedTime); + } +} +} // namespace Aidge diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp index dffabe6aab92bdfdd0c79b61ab59e9bc6efb9d94..e7c37dd9905b3beb4a85e97802688d1a3c89a883 100644 --- a/unit_tests/Test_AddImpl.cpp +++ b/unit_tests/Test_AddImpl.cpp @@ -9,12 +9,26 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + #include <catch2/catch_test_macros.hpp> +#include <cuda.h> // cudaMemcpy, cudaMemcpyDeviceToHost -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/AddImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AddImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Add.hpp" #include "aidge/utils/TensorUtils.hpp" @@ -43,7 +57,8 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } // }); // input1->setBackend("cuda"); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + + Tensor expectedOutput = Array4D<float,3,3,3,2> { { { {{40, 94},{42, 96},{44, 98}}, @@ -61,58 +76,54 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { {{88, 142},{90, 144},{92, 146}} } } - }); + }; - std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input1); - op->associateInput(1, input1); + std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>(); op->setBackend("cuda"); op->setDataType(DataType::Float32); - myAdd->forward(); - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + op->associateInput(0, input1); + op->associateInput(1, input1); + op->forward(); - for(int i = 0; i < expectedOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } + Tensor myOutput = *(op->getOutput(0)); + myOutput.setBackend("cpu"); - delete[] computedOutput; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedOutput, myOutput, (1.0E-5F), (1.0E-8F))); } SECTION("Broadcasting") { std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // + { // + { // + {{0, 1},{2, 3},{4, 5}} // + }, // + { // + {{6, 7},{8, 
9},{10, 11}} // + }, // + { // + {{12, 13},{14, 15},{16, 17}} // + } // + } // + }); + input_0->setBackend("cuda"); + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{20, 21},{22, 23},{24, 25}}, // - {{26, 27},{28, 29},{30, 31}}, // - {{32, 33},{34, 35},{36, 37}} // - } // - } // - }); // + { // + { // + {{20, 21},{22, 23},{24, 25}}, // + {{26, 27},{28, 29},{30, 31}}, // + {{32, 33},{34, 35},{36, 37}} // + } // + } // + }); + input_1->setBackend("cuda"); std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); - input_0->setBackend("cuda"); - input_1->setBackend("cuda"); input_2->setBackend("cuda"); /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput0 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { + Tensor expectedOutput0 = Array4D<float,3,3,3,2> { { // { // {{ 20, 22},{ 24, 26},{ 28, 30}}, // @@ -130,56 +141,47 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { {{ 44, 46},{ 48, 50},{52, 54}} // } // } // - }); // + }; - std::shared_ptr<Node> myAdd0 = Add(); - auto op0 = std::static_pointer_cast<OperatorTensor>(myAdd0 -> getOperator()); - op0->associateInput(0, input_0); - op0->associateInput(1, input_1); + std::shared_ptr<Add_Op> op0 = std::make_shared<Add_Op>(); op0->setDataType(DataType::Float32); op0->setBackend("cuda"); - myAdd0->forward(); - float* computedOutput0 = new float[expectedOutput0->size()](); - cudaMemcpy(computedOutput0, op0->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput0->size(), cudaMemcpyDeviceToHost); + op0->associateInput(0, input_0); + op0->associateInput(1, input_1); + op0->forward(); - for(int i = 0; i < expectedOutput0->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput0->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput0[i] - targetOutput) < 1e-6); - } + Tensor myOutput0 = *(op0->getOutput(0)); + myOutput0.setBackend("cpu"); - delete[] computedOutput0; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedOutput0, myOutput0, (1.0E-5F), (1.0E-8F))); /// Input0(d0, d1, d2, d3) + Input1(d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{100, 201},{102, 203},{104, 205}} // - }, // - { // - {{106, 207},{108, 209},{110, 211}} // - }, // - { // - {{112, 213},{114, 215},{116, 217}} // - } // - } // - }); // - std::shared_ptr<Node> myAdd1 = Add(); - auto op1 = std::static_pointer_cast<OperatorTensor>(myAdd1 -> getOperator()); - op1->associateInput(0, input_0); - op1->associateInput(1, input_2); + Tensor expectedOutput1 = Array4D<float,3,1,3,2> { + { // + { // + {{100, 201},{102, 203},{104, 205}} // + }, // + { // + {{106, 207},{108, 209},{110, 211}} // + }, // + { // + {{112, 213},{114, 215},{116, 217}} // + } // + } // + }; // + std::shared_ptr<Add_Op> op1 = std::make_shared<Add_Op>(); op1->setDataType(DataType::Float32); op1->setBackend("cuda"); - myAdd1->forward(); - float* computedOutput1 = new float[expectedOutput1->size()](); - cudaMemcpy(computedOutput1, op1->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput1->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < expectedOutput1->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput1->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput1[i] - targetOutput) < 1e-6); - } + op1->associateInput(0, input_0); + op1->associateInput(1, input_2); + op1->forward(); + + 
Tensor myOutput1 = *(op1->getOutput(0)); + myOutput1.setBackend("cpu"); - delete[] computedOutput1; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedOutput1, myOutput1, (1.0E-5F), (1.0E-8F))); } SECTION("Random Input") { @@ -202,12 +204,12 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Add Operator CUDA - std::shared_ptr<Node> myAddCUDA = Add("myaddcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator()); + std::shared_ptr<Add_Op> op_cuda = std::make_shared<Add_Op>(); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); // Create Add Operator CPU - std::shared_ptr<Node> myAddCPU = Add("myaddcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator()); + std::shared_ptr<Add_Op> op_cpu = std::make_shared<Add_Op>(); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -216,16 +218,9 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { for (std::size_t i = 0; i < nbDims; ++i) { const std::size_t dim = dimSizeDist(gen); // To test broadcasting, set some dims to 1 - if (boolDist(gen)) { - dims0.push_back(1); - }else{ - dims0.push_back(dim); - } - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } + dims0.push_back(boolDist(gen) ? 1 : dim); + dims1.push_back(boolDist(gen) ? 1 : dim); + dims.push_back(std::max(dims0[i], dims1[i])); } const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); @@ -281,8 +276,6 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -309,18 +302,20 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // + { // + { // + {{0, 1},{2, 3},{4, 5}} // + }, // + { // + {{6, 7},{8, 9},{10, 11}} // + }, // + { // + {{12, 13},{14, 15},{16, 17}} // + } // + } // + }); + input_0->setBackend("cuda"); + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { { // { // @@ -330,16 +325,16 @@ TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { } // } // }); // - - input_0->setBackend("cuda"); input_1->setBackend("cuda"); - std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input_0); - op->associateInput(1, input_1); + + auto op = std::make_shared<Add_Op>(); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myAdd->forward(); + + op->associateInput(0, input_0); + op->associateInput(1, input_1); + + op->forward(); // Run and test backward operation std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { @@ -362,46 +357,38 @@ TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { } // }); // myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myAdd->backward()); - - std::shared_ptr<Tensor> expectedInput1Grad = 
std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{21, 24},{27, 30},{33, 36}} // - }, // - { // - {{75, 78},{81, 84},{87, 90}} // - }, // - { // - {{129, 132},{135, 138},{141, 144}}// - } // - } // - }); // - std::shared_ptr<Tensor> expectedInput2Grad = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{57, 60},{63, 66},{69, 72}}, // - {{75, 78},{81, 84},{87, 90}}, // - {{93, 96},{99, 102},{105, 108}} // - } // - } // - }); // + REQUIRE_NOTHROW(op->backward()); - float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); - cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost); - float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); - cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost); + Tensor expectedInput1Grad = Array4D<float,3,1,3,2> { + { // + { // + {{21, 24},{27, 30},{33, 36}} // + }, // + { // + {{75, 78},{81, 84},{87, 90}} // + }, // + { // + {{129, 132},{135, 138},{141, 144}}// + } // + } // + }; + Tensor expectedInput2Grad = Array4D<float,1,3,3,2> { + { // + { // + {{57, 60},{63, 66},{69, 72}}, // + {{75, 78},{81, 84},{87, 90}}, // + {{93, 96},{99, 102},{105, 108}} // + } // + } // + }; - for(int i = 0; i < expectedInput1Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); - } - for(int i = 0; i < expectedInput2Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); - } + Tensor input1Grad = *(op->getInput(0)->grad()); + input1Grad.setBackend("cpu"); + Tensor input2Grad = *(op->getInput(1)->grad()); + input2Grad.setBackend("cpu"); - delete[] computedGrad1Cuda; - delete[] computedGrad2Cuda; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedInput1Grad, input1Grad, (1.0E-5F), (1.0E-8F))); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedInput2Grad, input2Grad, (1.0E-5F), (1.0E-8F))); } \ No newline at end of file diff --git a/unit_tests/Test_AndImpl.cpp b/unit_tests/Test_AndImpl.cpp index 66de926088bb47c06ea1f9f10655730404787149..951868a2f652a00bcea793f8b5abb00012aabca8 100644 --- a/unit_tests/Test_AndImpl.cpp +++ b/unit_tests/Test_AndImpl.cpp @@ -9,11 +9,13 @@ * ********************************************************************************/ -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <memory> + #include <catch2/catch_test_macros.hpp> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AndImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/utils/TensorUtils.hpp" @@ -81,13 +83,12 @@ TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { } }); - std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + std::shared_ptr<And_Op> op = std::make_shared<And_Op>(); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setBackend("cuda"); op->setDataType(DataType::Float32); - myAnd->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; @@ -106,7 +107,7 @@ TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { } 
// }); // input_1->setBackend("cuda"); - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}}); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}}); const Tensor myOutput = Tensor(Array4D<float,1,3,3,2> { { // { // @@ -117,13 +118,12 @@ TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { } // }); // input_2->setBackend("cuda"); - std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + std::shared_ptr<And_Op> op = std::make_shared<And_Op>(); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myAnd->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); diff --git a/unit_tests/Test_ArgMaxImpl.cpp b/unit_tests/Test_ArgMaxImpl.cpp index d123b5bd3376c7169b2e003d8b366bb9045fe3e1..ecd232257ec6a4a15959d73ef0481cfbc2332ae8 100644 --- a/unit_tests/Test_ArgMaxImpl.cpp +++ b/unit_tests/Test_ArgMaxImpl.cpp @@ -9,40 +9,42 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <memory> + #include <catch2/catch_test_macros.hpp> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" -#include "aidge/operator/Add.hpp" +#include "aidge/operator/ArgMax.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("3D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> { + { { - { - { 1.0, 2.0, 3.0, 4.0}, - { 8.0, 0.0, 17.0, 1.0}, - { 5.0, 10.0, 6.0, 0.0} - }, - { - { 7.0, 1.0, 9.0, 4.0}, - { 0.0, 8.0, 4.0, 2.0}, - { 9.0, 2.0, 0.0, 5.0} - } + { 1.0, 2.0, 3.0, 4.0}, + { 8.0, 0.0, 17.0, 1.0}, + { 5.0, 10.0, 6.0, 0.0} + }, + { + { 7.0, 1.0, 9.0, 4.0}, + { 0.0, 8.0, 4.0, 2.0}, + { 9.0, 2.0, 0.0, 5.0} } - }); - myInput->setBackend("cuda"); + } + }); + myInput->setBackend("cuda"); + SECTION("Axis 2") { const Tensor myOutput = Tensor(Array3D<float,2,3, 1> { - { - { + { + { {3.0}, {2.0}, {1.0} @@ -55,12 +57,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { } }); - std::shared_ptr<Node> myArgMax = ArgMax(2); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + std::shared_ptr<ArgMax_Op> op = std::make_shared<ArgMax_Op>(2); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myArgMax->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); @@ -70,18 +71,17 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("Axis 2 with keep_dims false") { const Tensor myOutput = Tensor(Array2D<float,2,3> { - { + { { 3.0, 2.0, 1.0 }, { 2.0, 1.0, 0.0 } } }); - std::shared_ptr<Node> myArgMax = ArgMax(2,0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + std::shared_ptr<ArgMax_Op> op = std::make_shared<ArgMax_Op>(2,0); op->associateInput(0,myInput); 
op->setDataType(DataType::Float32); op->setBackend("cuda"); - myArgMax->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); @@ -99,12 +99,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { } }); - std::shared_ptr<Node> myArgMax = ArgMax(1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + std::shared_ptr<ArgMax_Op> op = std::make_shared<ArgMax_Op>(1); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myArgMax->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); @@ -121,12 +120,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { } }); - std::shared_ptr<Node> myArgMax = ArgMax(0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + std::shared_ptr<ArgMax_Op> op = std::make_shared<ArgMax_Op>(0); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myArgMax->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); @@ -141,12 +139,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { }); const Tensor myOutput = Tensor(Array1D<float,1> {{9}}); - std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); + std::shared_ptr<ArgMax_Op> op = std::make_shared<ArgMax_Op>(0,1,1); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myArgMax->forward(); + op->forward(); std::shared_ptr<Tensor> outputFallback; const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp index 3dccd6b7f909a9e9b4f8affb151898b77d94a7cf..b658ee759d946249c7fc3c56c80dfa367fa0371f 100644 --- a/unit_tests/Test_AvgPoolingImpl.cpp +++ b/unit_tests/Test_AvgPoolingImpl.cpp @@ -9,17 +9,31 @@ * ********************************************************************************/ -#include <array> -#include <cuda_fp16.h> // half type -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <cuda_fp16.h> // half type +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" -#include "aidge/data/Tensor.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/half.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/AvgPooling.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -52,8 +66,7 @@ 
TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {145, 146, 147, 148, 149}}}}}); SECTION("Stride") { - std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<AvgPooling_Op<2>> op = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({2,2}), std::array<DimSize_t, 2>({2,2})); std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ {{{{3, 5}, @@ -67,7 +80,7 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myAvgPool->forward(); + op->forward(); float *computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); @@ -75,7 +88,7 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") for (int i = 0; i < myOutput->size(); i++) { const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -84,19 +97,18 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") SECTION("Stride >= feature dim") { std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW - { - {{{0.3745, 0.9507, 0.7320}, - {0.5987, 0.1560, 0.1560}, - {0.0581, 0.8662, 0.6011}}}}}); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + { + {{{0.3745, 0.9507, 0.7320}, + {0.5987, 0.1560, 0.1560}, + {0.0581, 0.8662, 0.6011}}}}}); + std::shared_ptr<AvgPooling_Op<2>> op = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({3,3}), std::array<DimSize_t, 2>({3,3})); std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myAvgPool->forward(); + op->forward(); float *computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); @@ -104,7 +116,7 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") for (int i = 0; i < myOutput->size(); i++) { const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -119,14 +131,13 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}}); myInput2->setBackend("cuda"); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<AvgPooling_Op<2>> op = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({3,3}), std::array<DimSize_t, 2>({3,3})); std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ {{{{(half_float::half(0.3745) + half_float::half(0.9507) + 
half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float16); op->setBackend("cuda"); - myAvgPool->forward(); + op->forward(); half_float::half *computedOutput = new half_float::half[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost); @@ -134,7 +145,7 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") for (int i = 0; i < myOutput->size(); i++) { const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -160,14 +171,12 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, "myAvgPoolCuda", {stride, stride}); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAvgPoolCuda->getOperator()); + std::shared_ptr<AvgPooling_Op<2>> op_cuda = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({kernel, kernel}), std::array<DimSize_t, 2>({stride, stride})); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); - // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCpu = AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAvgPoolCpu->getOperator()); + // Create AveragePooling Operator CPU + std::shared_ptr<AvgPooling_Op<2>> op_cpu = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({kernel, kernel}), std::array<DimSize_t, 2>({stride, stride})); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -208,7 +217,7 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") T0_cpu->resize(dims); T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -227,8 +236,8 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } @@ -249,12 +258,11 @@ TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") }); myInput->setBackend("cuda"); - std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<AvgPooling_Op<2>> op = std::make_shared<AvgPooling_Op<2>>(std::array<DimSize_t, 2>({2, 2}), std::array<DimSize_t, 2>({2, 2})); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myAvgPool->forward(); + op->forward(); // Run and test backward operation std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float, 1,1,2,2> { @@ -271,7 
+279,7 @@ TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myAvgPool->backward()); + REQUIRE_NOTHROW(op->backward()); std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4>{ { @@ -288,10 +296,10 @@ TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") float *computedGradCuda = new float[expectedInputGrad->size()](); cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - + for(int i = 0; i < expectedInputGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp index 5b8d3eae7b8816bf70ffdd7e78b56305fe0f7191..b09d7f74d8e748fb642ab006e3aad95efd840174 100644 --- a/unit_tests/Test_BatchNormImpl.cpp +++ b/unit_tests/Test_BatchNormImpl.cpp @@ -9,17 +9,31 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/BatchNormImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/BatchNormImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/BatchNorm.hpp" #include "aidge/utils/TensorUtils.hpp" -#include "Test_cuda.hpp" using namespace Aidge; @@ -64,7 +78,7 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { } } }); - + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { { { @@ -113,7 +127,7 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-5); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-5); } delete[] computedOutput; @@ -149,13 +163,13 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { // Create BatchNorm Operator Cuda std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, false, "mybatchnormcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator()); + auto op_cuda = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNormCuda -> getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create BatchNorm Operator 
CPU std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, false, "mybatchnormcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator()); + auto op_cpu = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNormCpu -> getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -294,14 +308,14 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { cudaFree(mean_d); cudaFree(var_d); } - std::cout << "total time: " << duration.count() << "μs" << std::endl; + fmt::print("Total time: {} μs\n", duration.count()); } } TEST_CASE("[gpu/operator] BatchNorm(backward)") { SECTION("Static Input") { std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + auto op = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNorm -> getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); // Forward @@ -396,7 +410,7 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { std::shared_ptr<Tensor> weights = op->getInput(1); std::shared_ptr<Tensor> bias = op->getInput(2); predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myBatchNorm->backward()); + REQUIRE_NOTHROW(op->backward()); std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ { @@ -427,10 +441,10 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { float *computedGradCuda = new float[expectedInputGrad->size()](); cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - + for(int i = 0; i < expectedInputGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; diff --git a/unit_tests/Test_CastMove.cpp b/unit_tests/Test_CastMove.cpp index c96600f79967c69e43b3c334d3624f6514b6f936..f9645e432af34fbe5d57d236e7f49092adac8f06 100644 --- a/unit_tests/Test_CastMove.cpp +++ b/unit_tests/Test_CastMove.cpp @@ -9,19 +9,24 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> #include <memory> #include <string> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ConvImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" -#include "aidge/utils/TensorUtils.hpp" #include "aidge/graph/Node.hpp" #include "aidge/graph/GraphView.hpp" #include "aidge/graph/OpArgs.hpp" +#include "aidge/operator/Conv.hpp" +#include "aidge/recipes/Recipes.hpp" // explicitCastMove #include "aidge/scheduler/SequentialScheduler.hpp" -#include "aidge/recipes/Recipes.hpp" - -#include "aidge/backend/cuda.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; @@ -111,19 +116,19 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other1 = 
std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv1")->getOperator())->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other2 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv2")->getOperator())->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv3")->getOperator())->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); @@ -198,19 +203,19 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other1 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv1")->getOperator())->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other2 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv2")->getOperator())->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv3")->getOperator())->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); diff --git a/unit_tests/Test_ClipImpl.cpp b/unit_tests/Test_ClipImpl.cpp index b62f874c198a544f571a98ebb22c1cd7fbd923cf..5c53bac5919da2d80226003259ffa9e4a9d06cf2 100644 --- a/unit_tests/Test_ClipImpl.cpp +++ b/unit_tests/Test_ClipImpl.cpp @@ -9,28 +9,42 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution -#include <chrono> -#include <catch2/catch_test_macros.hpp> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" -#include 
"aidge/operator/Clip.hpp" +#include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/ClipImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ClipImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/Clip.hpp" #include "aidge/utils/TensorUtils.hpp" + using namespace std::chrono; namespace Aidge { -TEST_CASE("[gpu/operator] Clip", "[Clip][GPU]") +TEST_CASE("[gpu/operator] Clip", "[Clip][GPU]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0, 10); + std::uniform_real_distribution<float> valueDist(0, 10); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),std::size_t(20)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(6)); std::uniform_int_distribution<int> boolDist(0,1); @@ -44,24 +58,22 @@ TEST_CASE("[gpu/operator] Clip", "[Clip][GPU]") std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { - std::shared_ptr<Node> myClipCUDA = Clip("clcuda",1.0,3.0); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myClipCUDA -> getOperator()); + std::shared_ptr<Clip_Op> op_cuda = std::make_shared<Clip_Op>(1.0, 3.0); // Create Div Operator CPU - std::shared_ptr<Node> myClipCPU = Clip("clcpu",1.0,3.0); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myClipCPU -> getOperator()); + std::shared_ptr<Clip_Op> op_cpu = std::make_shared<Clip_Op>(1.0, 3.0); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims0; - for (std::size_t i = 0; i < nbDims; ++i) + for (std::size_t i = 0; i < nbDims; ++i) { const std::size_t dim = dimSizeDist(gen); dims0.push_back(dim); } - + const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); float* array0 = new float[nb_elements0]; for (std::size_t i = 0; i < nb_elements0; ++i) { @@ -109,13 +121,13 @@ TEST_CASE("[gpu/operator] Clip", "[Clip][GPU]") delete[] array0; cudaFree(array0_d); - auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); - std::cout << "CUDA exec time: " << duration_cuda << " ms" << std::endl; - auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); - std::cout << "CPU exec time: " << duration_cpu << " ms" << std::endl; + const auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); + fmt::print("CUDA exec time: {} ms\n", duration_cuda); + const auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); + fmt::print("CPU exec time: {} ms\n", duration_cpu); //Exec time difference (CPU - CUDA): - auto difference = duration_cpu - duration_cuda; - std::cout << "Exec time difference (CPU - CUDA): " << difference << " ms" << std::endl; + const float ratio = static_cast<float>(duration_cuda) / static_cast<float>(duration_cpu); + fmt::print("Exec time ratio (CUDA/CPU): {} %\n", ratio*100); } } diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp index 4655de069cce86e80881a06673621c8159be18f6..2fc5ae20740218622bd2bd23a720b288ae12d211 100644 --- a/unit_tests/Test_ConvDepthWiseImpl.cpp +++ 
b/unit_tests/Test_ConvDepthWiseImpl.cpp @@ -9,15 +9,32 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> // cudaMemcpy, cudaMemcpyDeviceToHost +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ConvImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/ConvDepthWise.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -161,7 +178,7 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -309,6 +326,6 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { cudaFree(weight_d); cudaFree(bias_d); } - std::cout << "total time: " << duration.count() << "μs" << std::endl; + fmt::print("total time: {}μs\n", duration.count()); } } \ No newline at end of file diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp index 72a4040a8ecbd091e24f8441d9c29970ea82c606..38876a78098ee9ce1e12b00d99ba969722fe6685 100644 --- a/unit_tests/Test_ConvImpl.cpp +++ b/unit_tests/Test_ConvImpl.cpp @@ -9,15 +9,32 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> // cudaMemcpy, cudaMemcpyDeviceToHost +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/ConvImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ConvImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/Conv.hpp" +#include
"aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -169,7 +186,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") { } } }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { { { {{ 15226, 15577, 15928}, @@ -363,7 +380,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") { cudaFree(weight_d); cudaFree(bias_d); } - std::cout << "total time: " << duration.count() << "μs" << std::endl; + fmt::print("total time: {}μs", duration.count()); } } diff --git a/unit_tests/Test_DivImpl.cpp b/unit_tests/Test_DivImpl.cpp index 07cde5d6acb8eeeff2667e5c67aedb87b893e84c..1f3e38a10b8c694fc1f8980d6e756775dcc35c4b 100644 --- a/unit_tests/Test_DivImpl.cpp +++ b/unit_tests/Test_DivImpl.cpp @@ -9,15 +9,30 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> // cudaMemcpy, cudaMemcpyDeviceToHost -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/DivImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/DivImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/Div.hpp" +#include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -42,12 +57,12 @@ constexpr std::uint16_t NBTRIALS = 10; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Div Operator CUDA - std::shared_ptr<Node> myDivCUDA = Div(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myDivCUDA -> getOperator()); + std::shared_ptr<Div_Op> op_cuda = std::make_shared<Div_Op>(); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); // Create Div Operator CPU - std::shared_ptr<Node> myDivCPU = Div(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myDivCPU -> getOperator()); + std::shared_ptr<Div_Op> op_cpu = std::make_shared<Div_Op>(); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -55,12 +70,8 @@ constexpr std::uint16_t NBTRIALS = 10; std::vector<std::size_t> dims0, dims1, dims; for (std::size_t i = 0; i < nbDims; ++i) { const std::size_t dim = dimSizeDist(gen); - dims0.push_back(dim); - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } + dims0.push_back(dim); + dims1.push_back(boolDist(gen) ? 
1 : dim); dims.push_back(std::max(dims0[i], dims1[i])); } @@ -116,8 +127,6 @@ constexpr std::uint16_t NBTRIALS = 10; T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -136,5 +145,6 @@ constexpr std::uint16_t NBTRIALS = 10; cudaFree(array0_d); cudaFree(array1_d); } + fmt::print("total time: {}μs", duration.count()); } } // namespace Aidge diff --git a/unit_tests/Test_ErfImpl.cpp b/unit_tests/Test_ErfImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e81cdee09c03577954b7d061036756ca17d1b8f --- /dev/null +++ b/unit_tests/Test_ErfImpl.cpp @@ -0,0 +1,106 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Erf", "[Erf][GPU]") { +constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Erf Operator CUDA + std::shared_ptr<Node> myErfCUDA = Erf(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myErfCUDA -> getOperator()); + + // Create Erf Operator CPU + std::shared_ptr<Node> myErfCPU = Erf(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myErfCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float* array0 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + 
cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + } +} +} // namespace Aidge \ No newline at end of file diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp index 472fd273b1b5eff49e0d05ebd499afdb1435770c..284c46204d3a7a5623c2183b09f1e4691fc9487f 100644 --- a/unit_tests/Test_FCImpl.cpp +++ b/unit_tests/Test_FCImpl.cpp @@ -9,15 +9,32 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/FCImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/FCImpl.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/FC.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -76,7 +93,7 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -124,7 +141,7 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -264,7 +281,7 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { cudaFree(weight_d); cudaFree(bias_d); } - std::cout << "total time: " << 
duration.count() << "μs" << std::endl; + fmt::print("Total time: {}μs\n", duration.count()); } } @@ -330,15 +347,15 @@ TEST_CASE("[gpu/operator] FC(backward)", "[FC][GPU]") { for(int i = 0; i < expectedInputGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 1e-6); } for(int i = 0; i < expectedBiasGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedBiasGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradBCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradBCuda[i] - targetOutput) < 1e-6); } for(int i = 0; i < expectedWeightsGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedWeightsGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradWCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradWCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/Test_GlobalAveragePoolingImpl.cpp index 0a0f22ab60ced3a3f7648ce798484f72bd67839a..5c13f45bc40d3aaa874209f2475468e9b59d500e 100644 --- a/unit_tests/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/Test_GlobalAveragePoolingImpl.cpp @@ -9,22 +9,35 @@ * ********************************************************************************/ -// #include <aidge/utils/Types.h> -// #include <catch2/catch_test_macros.hpp> -// #include <chrono> -// #include <cmath> -// #include <memory> -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" + namespace Aidge { + TEST_CASE("[gpu/operator] GlobalAveragePooling", "[GlobalAveragePooling][GPU]") { @@ -74,7 +87,7 @@ TEST_CASE("[gpu/operator] GlobalAveragePooling", for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -146,7 +159,7 @@ TEST_CASE("[gpu/operator] GlobalAveragePooling", T0_cpu->resize(dims); T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -165,8 +178,8 @@ TEST_CASE("[gpu/operator] 
GlobalAveragePooling", delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } } // namespace Aidge diff --git a/unit_tests/Test_ILayerNormImpl.cpp b/unit_tests/Test_ILayerNormImpl.cpp index 0487b7c4716596e0d2e7bcbdaf812358be4de3bf..04b16c716360d3a67d960ff53e7dc210ced09df1 100644 --- a/unit_tests/Test_ILayerNormImpl.cpp +++ b/unit_tests/Test_ILayerNormImpl.cpp @@ -11,16 +11,20 @@ * ********************************************************************************/ -#include <array> +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <memory> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> -#include "Test_cuda.hpp" - +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/operator/ILayerNorm.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -57,8 +61,7 @@ TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { myWeight->setBackend("cuda"); myBias->setBackend("cuda"); - std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + std::shared_ptr<ILayerNorm_Op> op = std::make_shared<ILayerNorm_Op>(); op -> associateInput(1, myWeight); op -> associateInput(2, myBias); @@ -101,9 +104,9 @@ TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_ilayernorm->size(), cudaMemcpyDeviceToHost); //test if forward result are as expected - for(int i = 0; i < output_ilayernorm->size(); i++){ + for(std::size_t i = 0; i < output_ilayernorm->size(); i++){ const float targetOutput = *(static_cast<float*>(output_ilayernorm->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } } @@ -112,7 +115,7 @@ TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") -{ +{ std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW { { @@ -122,7 +125,7 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") }, } }); - + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW { { @@ -132,7 +135,7 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") }, } }); - + std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW { { @@ -142,13 +145,12 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") }, } }); - + myWeight->setBackend("cuda"); myBias->setBackend("cuda"); - std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + std::shared_ptr<ILayerNorm_Op> op = std::make_shared<ILayerNorm_Op>(); op -> associateInput(1, myWeight); op -> associateInput(2, myBias); @@ -158,7 +160,7 @@ TEST_CASE("[gpu/operator] 
ILayerNorm(backward)", "[ILayerNorm][GPU]") op -> associateInput(0,input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myILayerNorm->forward(); + op->forward(); std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { { @@ -175,7 +177,7 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myILayerNorm->backward()); + REQUIRE_NOTHROW(op->backward()); std::shared_ptr<Tensor> expectedInputGradILayerNorm = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { { @@ -192,9 +194,9 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") cudaMemcpy(computedInputGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); //test if backward result are as expected - for(int i = 0; i < expectedInputGradILayerNorm->size(); i++){ + for(std::size_t i = 0; i < expectedInputGradILayerNorm->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGradILayerNorm->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); + REQUIRE(std::fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); } delete[] computedInputGradCuda; diff --git a/unit_tests/Test_LnImpl.cpp b/unit_tests/Test_LnImpl.cpp index 06e2205ba38ce0becd0326bf4d258b9f55a228bd..0d5734553edf2a2493c9020113970159cd7768f2 100644 --- a/unit_tests/Test_LnImpl.cpp +++ b/unit_tests/Test_LnImpl.cpp @@ -9,15 +9,27 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/LnImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/LnImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/operator/Ln.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -41,12 +53,12 @@ constexpr std::uint16_t NBTRIALS = 10; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Ln Operator CUDA - std::shared_ptr<Node> myLnCUDA = Ln(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myLnCUDA -> getOperator()); + std::shared_ptr<Ln_Op> op_cuda = std::make_shared<Ln_Op>(); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); // Create Ln Operator CPU - std::shared_ptr<Node> myLnCPU = Ln(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myLnCPU -> getOperator()); + std::shared_ptr<Ln_Op> op_cpu = std::make_shared<Ln_Op>(); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -84,8 +96,6 @@ constexpr std::uint16_t NBTRIALS = 10; T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); // 
forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); diff --git a/unit_tests/Test_MatMulImpl.cpp b/unit_tests/Test_MatMulImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..91278b6291675e07cf5159c2e63a85e6cc9f66c2 --- /dev/null +++ b/unit_tests/Test_MatMulImpl.cpp @@ -0,0 +1,378 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[gpu/operator] MatMul(forward)", "[MatMul][GPU]") { + const std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 1.0); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> distDims(2, 10); + std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); + + // Create MatMul Operator + std::shared_ptr<Node> myMatMulCPU = MatMul(); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMatMulCPU -> getOperator()); + + std::shared_ptr<Node> myMatMulCUDA = MatMul(); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMatMulCUDA -> getOperator()); + + // To measure execution time of 'MatMul_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration; + + SECTION("2-D Tensors") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2; + + // Create and populate the array with random float values + float* bigArray1 = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + bigArray1[i] = dis(gen); // Generate random float value + } + float* bigArray2 = new float[dim1*dim2]; + for (int i = 0; i < dim1*dim2; ++i) { + bigArray2[i] = dis(gen); // Generate random float value + } + + float * bigArray1_d, *bigArray2_d; + cudaMalloc(reinterpret_cast<void **>(&bigArray1_d), sizeof(float) * dim0*dim1); + cudaMemcpy(bigArray1_d, bigArray1, sizeof(float) * dim0*dim1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bigArray2_d), sizeof(float) * dim1*dim2); + cudaMemcpy(bigArray2_d, bigArray2, sizeof(float) * dim1*dim2, cudaMemcpyHostToDevice); + + // Create Input0 + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(DataType::Float32); + T1_cuda -> resize({dim0,dim1}); + T1_cuda->setBackend("cuda"); + op_cuda->associateInput(0, T1_cuda); + T1_cuda->getImpl()->setRawPtr(bigArray1_d, dim0*dim1); + + 
std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize({dim0,dim1}); + T1_cpu -> getImpl() -> setRawPtr(bigArray1, dim0*dim1); + + // Create Input1 + std::shared_ptr<Tensor> T2_cuda = std::make_shared<Tensor>(DataType::Float32); + T2_cuda -> resize({dim1,dim2}); + T2_cuda->setBackend("cuda"); + op_cuda->associateInput(1, T2_cuda); + T2_cuda->getImpl()->setRawPtr(bigArray2_d, dim1*dim2); + + std::shared_ptr<Tensor> T2_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T2_cpu); + T2_cpu->setDataType(DataType::Float32); + T2_cpu->setBackend("cpu"); + T2_cpu->resize({dim1,dim2}); + T2_cpu -> getImpl() -> setRawPtr(bigArray2, dim1*dim2); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + op_cpu->forwardDims(); + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] bigArray1; + delete[] bigArray2; + cudaFree(bigArray1_d); + cudaFree(bigArray2_d); + } + Log::info("multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); + } + SECTION("3-D Tensors") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dimNb = distNbMatrix(gen); + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2*dimNb; + + // Create and populate the array with random float values + float* bigArray1 = new float[dimNb*dim0*dim1]; + for (std::size_t i = 0; i < dimNb*dim0*dim1; ++i) { + bigArray1[i] = dis(gen); // Generate random float value + } + float* bigArray2 = new float[dimNb*dim1*dim2]; + for (int i = 0; i < dimNb*dim1*dim2; ++i) { + bigArray2[i] = dis(gen); // Generate random float value + } + + float * bigArray1_d, *bigArray2_d; + cudaMalloc(reinterpret_cast<void **>(&bigArray1_d), sizeof(float) * dimNb*dim0*dim1); + cudaMemcpy(bigArray1_d, bigArray1, sizeof(float) * dimNb*dim0*dim1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bigArray2_d), sizeof(float) * dimNb*dim1*dim2); + cudaMemcpy(bigArray2_d, bigArray2, sizeof(float) * dimNb*dim1*dim2, cudaMemcpyHostToDevice); + + // Create Input0 + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(DataType::Float32); + T1_cuda -> resize({dimNb,dim0,dim1}); + T1_cuda->setBackend("cuda"); + op_cuda->associateInput(0, T1_cuda); + T1_cuda->getImpl()->setRawPtr(bigArray1_d, dimNb*dim0*dim1); + + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize({dimNb,dim0,dim1}); + T1_cpu -> getImpl() -> setRawPtr(bigArray1, dimNb*dim0*dim1); + + 
// Create Input1 + std::shared_ptr<Tensor> T2_cuda = std::make_shared<Tensor>(DataType::Float32); + T2_cuda -> resize({dimNb,dim1,dim2}); + T2_cuda->setBackend("cuda"); + op_cuda->associateInput(1, T2_cuda); + T2_cuda->getImpl()->setRawPtr(bigArray2_d, dimNb*dim1*dim2); + + std::shared_ptr<Tensor> T2_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T2_cpu); + T2_cpu->setDataType(DataType::Float32); + T2_cpu->setBackend("cpu"); + T2_cpu->resize({dimNb,dim1,dim2}); + T2_cpu -> getImpl() -> setRawPtr(bigArray2, dimNb*dim1*dim2); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + op_cpu->forwardDims(); + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] bigArray1; + delete[] bigArray2; + cudaFree(bigArray1_d); + cudaFree(bigArray2_d); + } + Log::info("multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); + } + + SECTION("4-D Tensors") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dimNb1 = distNbMatrix(gen); + const std::size_t dimNb2 = distNbMatrix(gen); + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2*dimNb1*dimNb2; + + // Create and populate the array with random float values + float* bigArray1 = new float[dimNb1*dimNb2*dim0*dim1]; + for (std::size_t i = 0; i < dimNb1*dimNb2*dim0*dim1; ++i) { + bigArray1[i] = dis(gen); // Generate random float value + } + float* bigArray2 = new float[1*1*dim1*dim2]; + for (std::size_t i = 0; i < 1*1*dim1*dim2; ++i) { + bigArray2[i] = dis(gen); // Generate random float value + } + + float * bigArray1_d, *bigArray2_d; + cudaMalloc(reinterpret_cast<void **>(&bigArray1_d), sizeof(float) * dimNb1*dimNb2*dim0*dim1); + cudaMemcpy(bigArray1_d, bigArray1, sizeof(float) * dimNb1*dimNb2*dim0*dim1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bigArray2_d), sizeof(float) * 1*1*dim1*dim2); + cudaMemcpy(bigArray2_d, bigArray2, sizeof(float) * 1*1*dim1*dim2, cudaMemcpyHostToDevice); + + // Create Input0 + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(DataType::Float32); + T1_cuda -> resize({dimNb1,dimNb2,dim0,dim1}); + T1_cuda->setBackend("cuda"); + op_cuda->associateInput(0, T1_cuda); + T1_cuda->getImpl()->setRawPtr(bigArray1_d, dimNb1*dimNb2*dim0*dim1); + + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize({dimNb1,dimNb2,dim0,dim1}); + T1_cpu -> getImpl() -> setRawPtr(bigArray1, dimNb1*dimNb2*dim0*dim1); + + // Create Input1 + std::shared_ptr<Tensor> T2_cuda = std::make_shared<Tensor>(DataType::Float32); + T2_cuda -> 
resize({1,1,dim1,dim2}); + T2_cuda->setBackend("cuda"); + op_cuda->associateInput(1, T2_cuda); + T2_cuda->getImpl()->setRawPtr(bigArray2_d, 1*1*dim1*dim2); + + std::shared_ptr<Tensor> T2_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T2_cpu); + T2_cpu->setDataType(DataType::Float32); + T2_cpu->setBackend("cpu"); + T2_cpu->resize({1,1,dim1,dim2}); + T2_cpu -> getImpl() -> setRawPtr(bigArray2, 1*1*dim1*dim2); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + op_cpu->forwardDims(); + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] bigArray1; + delete[] bigArray2; + cudaFree(bigArray1_d); + cudaFree(bigArray2_d); + } + Log::info("multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); + } + + SECTION("+3-D / 1-D") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dimNb = distNbMatrix(gen); + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + totalComputation += dim0*dim1*dimNb; + + // Create and populate the array with random float values + float* bigArray1 = new float[dimNb*dim0*dim1]; + for (std::size_t i = 0; i < dimNb*dim0*dim1; ++i) { + bigArray1[i] = dis(gen); // Generate random float value + } + float* bigArray2 = new float[dim1]; + for (int i = 0; i < dim1; ++i) { + bigArray2[i] = dis(gen); // Generate random float value + } + + float * bigArray1_d, *bigArray2_d; + cudaMalloc(reinterpret_cast<void **>(&bigArray1_d), sizeof(float) * dimNb*dim0*dim1); + cudaMemcpy(bigArray1_d, bigArray1, sizeof(float) * dimNb*dim0*dim1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bigArray2_d), sizeof(float) * dim1); + cudaMemcpy(bigArray2_d, bigArray2, sizeof(float) * dim1, cudaMemcpyHostToDevice); + + // Create Input0 + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(DataType::Float32); + T1_cuda -> resize({dimNb,dim0,dim1}); + T1_cuda->setBackend("cuda"); + op_cuda->associateInput(0, T1_cuda); + T1_cuda->getImpl()->setRawPtr(bigArray1_d, dimNb*dim0*dim1); + + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize({dimNb,dim0,dim1}); + T1_cpu -> getImpl() -> setRawPtr(bigArray1, dimNb*dim0*dim1); + + // Create Input1 + std::shared_ptr<Tensor> T2_cuda = std::make_shared<Tensor>(DataType::Float32); + T2_cuda -> resize({dim1}); + T2_cuda->setBackend("cuda"); + op_cuda->associateInput(1, T2_cuda); + T2_cuda->getImpl()->setRawPtr(bigArray2_d, dim1); + + std::shared_ptr<Tensor> T2_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1,T2_cpu); + T2_cpu->setDataType(DataType::Float32); + T2_cpu->setBackend("cpu"); + T2_cpu->resize({dim1}); 
+ T2_cpu -> getImpl() -> setRawPtr(bigArray2, dim1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + op_cuda->forwardDims(); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + op_cpu->forwardDims(); + op_cpu->forward(); + float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] bigArray1; + delete[] bigArray2; + cudaFree(bigArray1_d); + cudaFree(bigArray2_d); + } + Log::info("multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); + } + +} \ No newline at end of file diff --git a/unit_tests/Test_MaxPoolingImpl.cpp b/unit_tests/Test_MaxPoolingImpl.cpp index 99850a0715cf8feb3164d58c410a1ef689feece1..0474a70dee755ccb1830a7ac27761fbf522e1260 100644 --- a/unit_tests/Test_MaxPoolingImpl.cpp +++ b/unit_tests/Test_MaxPoolingImpl.cpp @@ -9,15 +9,31 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/MaxPooling.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -78,13 +94,13 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { myMaxPool->getOperator()->setDataType(DataType::Float32); myMaxPool->getOperator()->setBackend("cuda"); myMaxPool->forward(); - + float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -92,8 +108,8 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; - std::size_t kernel = 3; - std::size_t stride = 3; + 
constexpr std::size_t kernel = 3; + constexpr std::size_t stride = 3; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); @@ -158,7 +174,7 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { T0_cpu->resize(dims); T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -177,7 +193,7 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } \ No newline at end of file diff --git a/unit_tests/Test_MulImpl.cpp b/unit_tests/Test_MulImpl.cpp index 9eaba6e80971a7075576cd3d4d409b79dac4eb0c..2559cb963921c5a4ec7be01e494163014c9fb001 100644 --- a/unit_tests/Test_MulImpl.cpp +++ b/unit_tests/Test_MulImpl.cpp @@ -9,15 +9,28 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> // cudaMemcpy, cudaMemcpyDeviceToHost -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/MulImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/MulImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/operator/Mul.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -42,12 +55,12 @@ constexpr std::uint16_t NBTRIALS = 10; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Mul Operator CUDA - std::shared_ptr<Node> myMulCUDA = Mul(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMulCUDA -> getOperator()); + std::shared_ptr<Mul_Op> op_cuda = std::make_shared<Mul_Op>(); + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); // Create Mul Operator CPU - std::shared_ptr<Node> myMulCPU = Mul(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMulCPU -> getOperator()); + std::shared_ptr<Mul_Op> op_cpu = std::make_shared<Mul_Op>(); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -116,8 +129,6 @@ constexpr std::uint16_t NBTRIALS = 10; T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); diff --git a/unit_tests/Test_PadImpl.cpp b/unit_tests/Test_PadImpl.cpp index 
4e799ea6b7d11c9b446e0e4c8b9d12beae24bb05..a30061d43c90dae801b8368a3df5723c50674a8d 100644 --- a/unit_tests/Test_PadImpl.cpp +++ b/unit_tests/Test_PadImpl.cpp @@ -9,15 +9,31 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/PadImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/PadImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -137,7 +153,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -251,7 +267,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -367,7 +383,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -492,7 +508,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -609,7 +625,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -622,7 +638,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { std::uniform_real_distribution<float> valueDist( 0.1f, 1.1f); // Random float distribution between 0 and 1 
std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), std::size_t(1)); - // TODO: fix Reflect and Wrap Pad, cpu and gpu only five same results when padding = 1 + // TODO: fix Reflect and Wrap Pad, cpu and gpu only give same results when padding = 1 std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10)); std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), std::size_t(5)); @@ -697,7 +713,7 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { delete[] computed_cuda; cudaFree(array0_d); } - std::cout << "total time: " << duration.count() << "μs" << std::endl; + fmt::print("Total time: {}μs\n", duration.count()); } } @@ -776,7 +792,7 @@ TEST_CASE("[gpu/operator] Pad(backward)", "[Pad][GPU]") { myInput->setBackend("cpu"); for(int i = 0; i < myInput->size(); i++){ const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; diff --git a/unit_tests/Test_PowImpl.cpp b/unit_tests/Test_PowImpl.cpp index 49e65b46d7d85b7087c5c73151d643593d91e02e..78be1ca2c92ccb7fc444dbbfa4ea53e0894c99f1 100644 --- a/unit_tests/Test_PowImpl.cpp +++ b/unit_tests/Test_PowImpl.cpp @@ -9,15 +9,30 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/PowImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/PowImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Pow.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -133,8 +148,8 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { @@ -147,7 +162,7 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - + std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { @@ -241,8 +256,8 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), 
std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } SECTION("+1-D Tensor / 1-D Tensor") { // Create Pow Operator @@ -346,9 +361,8 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } } diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp index 7ab38aa7def7f846555ae33ccd3871d6ee5a1539..d55ccd1c1ca17e2992ac2178f556a178914694b8 100644 --- a/unit_tests/Test_ReLUImpl.cpp +++ b/unit_tests/Test_ReLUImpl.cpp @@ -9,14 +9,28 @@ * ********************************************************************************/ -#include <array> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + #include <catch2/catch_test_macros.hpp> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <cuda.h> +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReLUImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -73,8 +87,7 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { } }); - std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); + std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>(); op->associateInput(0,input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -85,7 +98,7 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -110,8 +123,7 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create ReLU Operator - std::shared_ptr<Node> myReLU = ReLU("myReLU"); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>(); op->setDataType(DataType::Float32); 
op->setBackend("cuda"); @@ -147,9 +159,9 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { cudaMemcpy(input_d, input_h, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); T0->getImpl()->setRawPtr(input_d, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); - myReLU->forward(); + op->forward(); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); @@ -162,8 +174,7 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { delete[] input_h; cudaFree(input_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; - + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } diff --git a/unit_tests/Test_ReduceMeanImpl.cpp b/unit_tests/Test_ReduceMeanImpl.cpp index 041ad6e02d5f39fde22f34ce715d2b807e164b1a..e9d678fc6c507c734b0835a843e5f3cd0fa3bd32 100644 --- a/unit_tests/Test_ReduceMeanImpl.cpp +++ b/unit_tests/Test_ReduceMeanImpl.cpp @@ -9,15 +9,24 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <memory> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/operator/ReduceMean.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -51,18 +60,17 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } }); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({1})); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ + for(std::size_t i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -97,18 +105,17 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } }); - std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({1, 2})); 
op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ + for(std::size_t i = 0; i < myOutput->size(); ++i) { const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -140,18 +147,17 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } }); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({1}), false); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ + for(std::size_t i = 0; i < myOutput->size(); ++i) { const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "computed: " << computedOutput[i] << ", target: " << targetOutput << std::endl; - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + fmt::print("Computed: {}, target: {}\n", computedOutput[i], targetOutput); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -180,17 +186,16 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { {18.25} }); - std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({0, 1, 2}), false); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ + for(std::size_t i = 0; i < myOutput->size(); ++i) { const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -208,18 +213,17 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { {0.1293547f} }); - std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({0, 1}), false); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); float* computedOutput = new float[myOutput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; 
i < myOutput->size(); i++){ + for(std::size_t i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -242,19 +246,18 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } }); myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, true); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>(), false, true); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); - + op->forward(); + myInput->setBackend("cpu"); float* computedOutput = new float[myInput->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myInput->size(); i++){ + for(std::size_t i = 0; i < myInput->size(); i++){ const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -283,12 +286,11 @@ TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") { myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + std::shared_ptr<ReduceMean_Op> op = std::make_shared<ReduceMean_Op>(std::vector<std::int32_t>({1})); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReduceMean->forward(); + op->forward(); std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { @@ -317,14 +319,14 @@ TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") { }); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myReduceMean->backward()); + REQUIRE_NOTHROW(op->backward()); float *computedGradCuda = new float[expectedInputGrad->size()](); cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ + + for(std::size_t i = 0; i < expectedInputGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; diff --git a/unit_tests/Test_ReduceSumImpl.cpp b/unit_tests/Test_ReduceSumImpl.cpp index d0d37754102331c8f91a1ce1c81d679761916339..224601b906b0a18ccf1e09705f832f36fb29dad6 100644 --- a/unit_tests/Test_ReduceSumImpl.cpp +++ b/unit_tests/Test_ReduceSumImpl.cpp @@ -9,15 +9,19 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <cmath> // std::fabs +#include <memory> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include 
"aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/ReduceSum.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -51,7 +55,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { }); std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -61,7 +65,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "i: " << i << ", computed: " << computedOutput[i] << ", target: "<< targetOutput <<std::endl; + fmt::print("i: {}, computed: {}, target: {}\n", i, computedOutput[i], targetOutput); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -98,7 +102,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { }); std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -108,7 +112,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6f); } delete[] computedOutput; @@ -141,7 +145,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { }); std::shared_ptr<Node> myReduceSum = ReduceSum({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -176,11 +180,11 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { }); myInput->setBackend("cuda"); std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {219.0} + {219.0f} }); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -189,7 +193,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); for(int i = 0; i < myOutput->size(); i++){ const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + 
REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; @@ -208,7 +212,7 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { }); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -248,7 +252,7 @@ TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") { std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); + auto op = std::static_pointer_cast<ReduceSum_Op>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); @@ -285,7 +289,7 @@ TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") { float *computedGradCuda = new float[expectedInputGrad->size()](); cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - + for(int i = 0; i < expectedInputGrad->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); diff --git a/unit_tests/Test_ReshapeImpl.cpp b/unit_tests/Test_ReshapeImpl.cpp index df9a4dda6d59371c8dd07f8c4442e3a3bb4a7159..d62fc4625c51ff4affb207d8dd7c30d0661ad294 100644 --- a/unit_tests/Test_ReshapeImpl.cpp +++ b/unit_tests/Test_ReshapeImpl.cpp @@ -9,15 +9,29 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate, std::shuffle, std::transform -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Reshape.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -28,29 +42,26 @@ TEST_CASE("[gpu/operator] Reshape(forward)") { std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<float,6> { {1.0, 2.0, 3.0, 4.0, 5.0, 6.0} }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,3> { + input->setBackend("cuda"); + + Tensor expectedOutput = Array2D<float,2,3> { { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } - }); + }; - std::shared_ptr<Node> myReshape = Reshape({2, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); - op->associateInput(0, input); + std::shared_ptr<Reshape_Op> op = std::make_shared<Reshape_Op>(std::vector<std::int64_t>({2, 3})); 
op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + op->associateInput(0, input); + op->forward(); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } + Tensor myOutput = *(op->getOutput(0)); + myOutput.setBackend("cpu", 0, true); // create a new impl on CPU - delete[] computedOutput; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedOutput, myOutput, (1.0E-5F), (1.0E-8F))); } SECTION("2D Tensor") { std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { @@ -60,30 +71,28 @@ { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { + input->setBackend("cuda"); + + Tensor expectedOutput = Array2D<float,3,2> { { {1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0} } - }); + }; - std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); - op->associateInput(0, input); + std::shared_ptr<Reshape_Op> op = std::make_shared<Reshape_Op>(std::vector<std::int64_t>({3, 2})); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + op->associateInput(0, input); + op->forward(); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } + Tensor myOutput = *(op->getOutput(0)); + myOutput.setBackend("cpu", 0, true); // create a new impl on CPU + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedOutput, myOutput, (1.0E-5F), (1.0E-8F))); - delete[] computedOutput; } SECTION("Random Input") { @@ -158,7 +167,7 @@ TEST_CASE("[gpu/operator] Reshape(forward)") { T0_cpu->resize(dims); T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); @@ -176,99 +185,89 @@ TEST_CASE("[gpu/operator] Reshape(forward)") { delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; - + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } TEST_CASE("[gpu/operator] Reshape(backward)") { SECTION("1D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } }); + input->setBackend("cuda"); - std::shared_ptr<Node> myReshape = Reshape({6}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); - op->associateInput(0, myInput); + std::shared_ptr<Reshape_Op> op = std::make_shared<Reshape_Op>(std::vector<std::int64_t>({6})); op->setDataType(DataType::Float32); op->setBackend("cuda"); -
myReshape->forward(); + + op->associateInput(0, input); + op->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array1D<float, 6> { + std::shared_ptr<Tensor> outputGrad = std::make_shared<Tensor>(Array1D<float, 6> { {1, 2, 3, 4, 5, 6} }); - myOutputGrad->setBackend("cuda"); - std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); - std::shared_ptr<Tensor> input = op->getInput(0); - predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myReshape->backward()); + outputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(outputGrad); + + REQUIRE_NOTHROW(op->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { + Tensor expectedInputGrad = Array2D<float,2,3> { { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } - }); + }; - float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); - } + Tensor inputGrad = *(input->grad()); + inputGrad.setBackend("cpu"); - delete[] computedGradCuda; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedInputGrad, inputGrad, (1.0E-5F), (1.0E-8F))); } + SECTION("2D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } }); + input->setBackend("cuda"); - std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); - op->associateInput(0, myInput); + std::shared_ptr<Reshape_Op> op = std::make_shared<Reshape_Op>(std::vector<std::int64_t>({3, 2})); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myReshape->forward(); + + op->associateInput(0, input); + op->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> { + std::shared_ptr<Tensor> outputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> { { {1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0} } }); - myOutputGrad->setBackend("cuda"); - std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); - std::shared_ptr<Tensor> input = op->getInput(0); - predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myReshape->backward()); + outputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(outputGrad); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { + REQUIRE_NOTHROW(op->backward()); + + Tensor expectedInputGrad = Array2D<float,2,3> { { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0} } - }); + }; - float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); - } + Tensor inputGrad = *(input->grad()); + inputGrad.setBackend("cpu"); - delete[] computedGradCuda; + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(expectedInputGrad, inputGrad, (1.0E-5F), (1.0E-8F))); } } diff --git 
a/unit_tests/Test_RoundImpl.cpp b/unit_tests/Test_RoundImpl.cpp index 4602e29c8070884927f2b4b3cda560aae380fd71..7d2dcbf512cef340e2e4ba49dd0472f4c647fd1c 100644 --- a/unit_tests/Test_RoundImpl.cpp +++ b/unit_tests/Test_RoundImpl.cpp @@ -9,28 +9,43 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution -#include <chrono> -#include <catch2/catch_test_macros.hpp> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" -#include "aidge/operator/Round.hpp" +#include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/RoundImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/RoundImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Round.hpp" #include "aidge/utils/TensorUtils.hpp" + using namespace std::chrono; namespace Aidge { -TEST_CASE("[gpu/operator] Round", "[Round][GPU]") +TEST_CASE("[gpu/operator] Round", "[Round][GPU]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0, 10); + std::uniform_real_distribution<float> valueDist(0, 10); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(20)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3), std::size_t(6)); @@ -55,7 +70,7 @@ TEST_CASE("[gpu/operator] Round", "[Round][GPU]") const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims0; - for (std::size_t i = 0; i < nbDims; ++i) + for (std::size_t i = 0; i < nbDims; ++i) { const std::size_t dim = dimSizeDist(gen); dims0.push_back(dim); @@ -106,13 +121,13 @@ TEST_CASE("[gpu/operator] Round", "[Round][GPU]") delete[] array0; cudaFree(array0_d); - auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); - std::cout << "Temps d'exécution CUDA: " << duration_cuda << " ms" << std::endl; - auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); - std::cout << "Temps d'exécution CPU: " << duration_cpu << " ms" << std::endl; + const auto duration_cuda = duration_cast<milliseconds>(endcuda - startcuda).count(); + fmt::print("Execution time on CUDA: {} ms\n", duration_cuda); + const auto duration_cpu = duration_cast<milliseconds>(endcpu - startcpu).count(); + fmt::print("Execution time on CPU: {} ms\n", duration_cpu); // Benchmark between CUDA and CPU execution time - auto difference = duration_cpu - duration_cuda; - std::cout << "Différence de temps (CPU - CUDA): " << difference << " ms" << std::endl; + const float ratio = static_cast<float>(duration_cuda) / static_cast<float>(duration_cpu); + fmt::print("Execution time ratio (CUDA/CPU): {} %\n", ratio*100); } } diff --git a/unit_tests/Test_ShiftGELUImpl.cpp 
b/unit_tests/Test_ShiftGELUImpl.cpp index 86e747e735eccb397caa8062f52c2561e8ef759d..44c6a72c68442ec4737b2774751a8752d63926fe 100644 --- a/unit_tests/Test_ShiftGELUImpl.cpp +++ b/unit_tests/Test_ShiftGELUImpl.cpp @@ -11,16 +11,20 @@ * ********************************************************************************/ -#include <array> +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <memory> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> -#include "Test_cuda.hpp" - +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/operator/ShiftGELU.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -103,27 +107,26 @@ TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { } }); - std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU -> getOperator()); + std::shared_ptr<ShiftGELU_Op> op = std::make_shared<ShiftGELU_Op>(); op->associateInput(0,input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - + float* computedOutput = new float[output_shiftGELU->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftGELU->size(), cudaMemcpyDeviceToHost); //test if forward result are as expected - for(int i = 0; i < output_shiftGELU->size(); i++){ + for(std::size_t i = 0; i < output_shiftGELU->size(); i++){ const float targetOutput = *(static_cast<float*>(output_shiftGELU->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6); } //measure difference between GELU and shiftgelu float sum = 0.0; - for(int i = 0; i < output_GELU->size(); i++){ + for(std::size_t i = 0; i < output_GELU->size(); i++){ const float targetOutput = *(static_cast<float*>(output_GELU->getImpl()->rawPtr()) + i); - sum += fabs(computedOutput[i] - targetOutput); + sum += std::fabs(computedOutput[i] - targetOutput); } sum = sum / output_GELU->size(); REQUIRE(sum < 1.5e-1); @@ -146,15 +149,14 @@ TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") }, } }); - + input0->setBackend("cuda"); - std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); + std::shared_ptr<ShiftGELU_Op> op = std::make_shared<ShiftGELU_Op>(); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myShiftGELU->forward(); + op->forward(); std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { { @@ -171,7 +173,7 @@ TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myShiftGELU->backward()); + REQUIRE_NOTHROW(op->backward()); //expected output of shiftgelu backward operator std::shared_ptr<Tensor> expectedInputGradShiftGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { @@ -201,16 +203,16 @@ TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); 
//test if backward result are as expected - for(int i = 0; i < expectedInputGradShiftGELU->size(); i++){ + for(std::size_t i = 0; i < expectedInputGradShiftGELU->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGradShiftGELU->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); + REQUIRE(std::fabs(computedGradCuda[i] - targetOutput) < 2e-6); } //measure difference between gelu and shifgelu float sum = 0.0; - for(int i = 0; i < expectedInputGradGELU->size(); i++){ + for(std::size_t i = 0; i < expectedInputGradGELU->size(); i++){ const float targetOutput = *(static_cast<float*>(expectedInputGradGELU->getImpl()->rawPtr()) + i); - sum += fabs(computedGradCuda[i] - targetOutput); + sum += std::fabs(computedGradCuda[i] - targetOutput); } sum = sum / expectedInputGradGELU->size(); REQUIRE(sum < 2e-1); diff --git a/unit_tests/Test_ShiftMaxImpl.cpp b/unit_tests/Test_ShiftMaxImpl.cpp index 2a94a23c3a04edd72cb535ebfb6e2c538e4aeee8..c357dc602515dc151ba039f12f67ba1cb4e0fdd5 100644 --- a/unit_tests/Test_ShiftMaxImpl.cpp +++ b/unit_tests/Test_ShiftMaxImpl.cpp @@ -11,16 +11,21 @@ * ********************************************************************************/ -#include <array> -#include <catch2/catch_test_macros.hpp> +#include <cmath> // std::fabs +#include <cstddef> // std::size_t +#include <memory> -#include "Test_cuda.hpp" +#include <catch2/catch_test_macros.hpp> +#include <cuda.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/operator/ShiftMax.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -107,7 +112,7 @@ TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - + float* computedOutput = new float[output_shiftmax->size()](); cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftmax->size(), cudaMemcpyDeviceToHost); @@ -144,7 +149,7 @@ TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") }, } }); - + input0->setBackend("cuda"); std::shared_ptr<Node> myShiftMax = ShiftMax(); diff --git a/unit_tests/Test_SoftmaxImpl.cpp b/unit_tests/Test_SoftmaxImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..399f0d3437819f8414992a899678a42053e81400 --- /dev/null +++ b/unit_tests/Test_SoftmaxImpl.cpp @@ -0,0 +1,106 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[gpu/operator] Softmax", "[Softmax][GPU]") { +constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(2), std::size_t(5)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + // Create Softmax Operator CUDA + std::shared_ptr<Node> mySoftmaxCUDA = Softmax(1); + auto op_cuda = std::static_pointer_cast<OperatorTensor>(mySoftmaxCUDA -> getOperator()); + + // Create Softmax Operator CPU + std::shared_ptr<Node> mySoftmaxCPU = Softmax(1); + auto op_cpu = std::static_pointer_cast<OperatorTensor>(mySoftmaxCPU -> getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float* array0 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float* array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); + cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0,T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + + std::shared_ptr<Tensor> outputFallback; + const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + } +} +} // namespace Aidge \ No newline at end of file diff --git 
a/unit_tests/Test_SqrtImpl.cpp b/unit_tests/Test_SqrtImpl.cpp index 432b5f474748eb2a68579328bd2a65daeaa1f734..934cb0949be2ea110758a916846afbcfc85cec70 100644 --- a/unit_tests/Test_SqrtImpl.cpp +++ b/unit_tests/Test_SqrtImpl.cpp @@ -9,15 +9,30 @@ * ********************************************************************************/ -#include <array> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> #include <catch2/catch_test_macros.hpp> - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include <cuda.h> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/SqrtImpl.hpp" +#include "aidge/backend/cuda/data/TensorImpl.hpp" +#include "aidge/backend/cuda/operator/SqrtImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Sqrt.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -101,7 +116,7 @@ constexpr std::uint16_t NBTRIALS = 10; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("Number of elements over time spent: {}\n", number_of_operation / duration.count()); + Log::info("Total time: {}μs\n", duration.count()); } } // namespace Aidge diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp index c24b5b457cce602d465a1ecddefc7b7f35964794..b25f4bc3cad4712236d97c46f31b1029e3cb3980 100644 --- a/unit_tests/Test_TensorImpl.cpp +++ b/unit_tests/Test_TensorImpl.cpp @@ -9,21 +9,22 @@ * ********************************************************************************/ -#include <array> +#include <algorithm> // std::equal +#include <cmath> // std::fabs +#include <vector> #include <catch2/catch_test_macros.hpp> +#include <cuda.h> #include "Test_cuda.hpp" - +#include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" - -#include "aidge/backend/cpu.hpp" -#include "aidge/backend/cuda.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; TEST_CASE("CUDA test") { - const int N = 100; + constexpr int N = 100; // Allocate host memory float* a = new float[N](); @@ -54,7 +55,7 @@ TEST_CASE("CUDA test") { // Verification for(int i = 0; i < N; i++){ - REQUIRE(fabs(out[i] - a[i] - b[i]) < 1e-6); + REQUIRE(std::fabs(out[i] - a[i] - b[i]) < 1e-6); } // Deallocate device memory @@ -90,7 +91,7 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(x.dims()[2] == 2); REQUIRE(x.size() == 8); - std::array<int, 8> val; + int val[8] = {0,0,0,0,0,0,0,0}; cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost); REQUIRE(val[0] == 1); REQUIRE(val[7] == 8); diff --git a/version.txt b/version.txt index 26b5dec61fefd5d211e3f0786863a4be5edeb195..8ea2ddfc77b9f41e373e0591f46fc2fc155eb4ca 100644 --- a/version.txt +++ b/version.txt @@
-1,2 +1,2 @@ -0.4.0 +0.5.0