From e3d5cc509d06518e233b227e56fb3fc20b36419f Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 13 Sep 2023 18:32:30 +0200
Subject: [PATCH] Fixed CUDA compilation

---
 CMakeLists.txt                    | 62 +++++++++++++------------------
 cmake/PybindModuleCreation.cmake  | 34 ++++++++---------
 include/aidge/data/TensorImpl.hpp | 14 +++++--
 include/aidge/utils/CudaUtils.hpp |  1 +
 unit_tests/CMakeLists.txt         |  7 +---
 unit_tests/Test_TensorImpl.cpp    | 48 ++++++++++++++++++++++++
 unit_tests/Test_cuda.cu           | 12 ++++++
 unit_tests/Test_cuda.hpp          |  6 +++
 8 files changed, 120 insertions(+), 64 deletions(-)
 create mode 100644 unit_tests/Test_cuda.cu
 create mode 100644 unit_tests/Test_cuda.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a52f230..cf4866d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,5 @@
-cmake_minimum_required(VERSION 3.15)
+# CMake >= 3.18 is required for good support of FindCUDAToolkit
+cmake_minimum_required(VERSION 3.18)
 
 file(READ "${CMAKE_SOURCE_DIR}/version.txt" version)
 file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project)
@@ -12,8 +13,6 @@ set(module_name _${project}) # target name
 
 project(${project})
 
-enable_language(CUDA)
-
 ##############################################
 # Define options
 option(PYBIND "python binding" ON)
@@ -26,18 +25,18 @@ option(COVERAGE "Enable coverage" OFF)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 include(PybindModuleCreation)
 
-#if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
-#    Include(CodeCoverage)
-#endif()
+if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
+    Include(CodeCoverage)
+endif()
+
+enable_language(CUDA)
 
 ##############################################
 # Find system dependencies
-find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-message(STATUS "CUDA Toolkit include DIRS: ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
+find_package(CUDAToolkit REQUIRED)
 find_package(aidge_core REQUIRED)
 find_package(aidge_backend_cpu REQUIRED)
-# TODO: add here additional Aidge dependencies if you need to.
 
 ##############################################
 # Create target and set properties
@@ -50,22 +49,26 @@ target_link_libraries(${module_name}
     PUBLIC
         _aidge_core # _ is added because we link the target not the project
         _aidge_backend_cpu # _ is added because we link the target not the project
-        # TODO: add here additional Aidge dependencies if you need to.
-        ${CUDART_LIBRARY}
+        CUDA::cudart
 )
 
 #Set target properties
-set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14")
 target_include_directories(${module_name}
     PUBLIC
         $<INSTALL_INTERFACE:include>
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/src
-        ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
 )
 
+if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 14)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(${module_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
 # PYTHON BINDING
 if (PYBIND)
     generate_python_binding(${project} ${module_name})
@@ -81,32 +84,19 @@ endif()
 
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
-
-if(WERROR)
-    target_compile_options(${module_name} PRIVATE
+target_compile_options(${module_name} PRIVATE
     $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-    -Wall -Wextra -fPIC -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow -Werror>)
-    target_compile_options(${module_name} PRIVATE
+    -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>)
+target_compile_options(${module_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:
+    -Wall>)
+target_compile_options(${module_name} PRIVATE
     $<$<CXX_COMPILER_ID:MSVC>:
     /W4>)
-else()
-    target_compile_options(${module_name} PRIVATE
-        $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-        -Wall -Wextra -fPIC -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow -Wpedantic>)
-    target_compile_options(${module_name} PRIVATE
-        $<$<CXX_COMPILER_ID:MSVC>:
-        /W4>)
-endif()
-
-#if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
-#    append_coverage_compiler_flags()
-#endif()
-# We need to explicitly state that we need all CUDA files in the
-# ${module_name} library to be built with -dc as the member functions
-# could be called by other libraries and executables
-set_target_properties(${module_name}
-                      PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
+    append_coverage_compiler_flags()
+endif()
 
 ##############################################
 # Installation instructions
diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake
index 18f4abc..8030c1a 100644
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
@@ -1,23 +1,21 @@
-function(generate_python_binding name target_to_bind)
-    if (PYBIND)
-        add_definitions(-DPYBIND)
-        Include(FetchContent)
+function(generate_python_binding name target_to_bind)
+    add_definitions(-DPYBIND)
+    Include(FetchContent)
 
-        FetchContent_Declare(
-        PyBind11
-        GIT_REPOSITORY https://github.com/pybind/pybind11.git
-        GIT_TAG        v2.10.4 # or a later release
-        )
+    FetchContent_Declare(
+    PyBind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG        v2.10.4 # or a later release
+    )
 
-        # Use the New FindPython mode, recommanded. Requires CMake 3.15+
-        find_package(Python COMPONENTS Interpreter Development)
-        FetchContent_MakeAvailable(PyBind11)
+    # Use the New FindPython mode, recommended. Requires CMake 3.15+
+    find_package(Python COMPONENTS Interpreter Development)
+    FetchContent_MakeAvailable(PyBind11)
 
-        message(STATUS "Creating binding for module ${name}")
-        file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
+    message(STATUS "Creating binding for module ${name}")
+    file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
 
-        pybind11_add_module(${name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
-        target_include_directories(${name} PUBLIC "python_binding")
-        target_link_libraries(${name} PUBLIC ${target_to_bind})
-    endif()
+    pybind11_add_module(${name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO_EXTRAS required for pip install
+    target_include_directories(${name} PUBLIC "python_binding")
+    target_link_libraries(${name} PUBLIC ${target_to_bind})
 endfunction()
diff --git a/include/aidge/data/TensorImpl.hpp b/include/aidge/data/TensorImpl.hpp
index 47edb55..fa12344 100644
--- a/include/aidge/data/TensorImpl.hpp
+++ b/include/aidge/data/TensorImpl.hpp
@@ -1,6 +1,9 @@
 #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 
+#include <thrust/equal.h>
+#include <thrust/execution_policy.h>
+
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Registrar.hpp"
@@ -22,8 +25,12 @@ class TensorImpl_cuda : public TensorImpl {
     TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
 
     bool operator==(const TensorImpl &otherImpl) const override final {
-        printf("Not implemented yet.");
-        return false;
+        const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
+
+        if (mTensor.size() != otherImplCuda.mTensor.size())
+            return false;
+
+        return thrust::equal(mData, mData + mTensor.size(), otherImplCuda.mData);
     }
 
     static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
@@ -49,9 +56,8 @@ class TensorImpl_cuda : public TensorImpl {
         cudaFree(mData);
     }
 
-    void setRawPtr(void *ptr) override final {
+    void setRawPtr(void* /*ptr*/) override final {
         printf("Not implemented yet.");
-        return false;
     };
 
   private:
diff --git a/include/aidge/utils/CudaUtils.hpp b/include/aidge/utils/CudaUtils.hpp
index df657eb..767025c 100644
--- a/include/aidge/utils/CudaUtils.hpp
+++ b/include/aidge/utils/CudaUtils.hpp
@@ -2,6 +2,7 @@
 #define CudaUtils_cuda_H_
 
 #include <string>
+#include <cassert>
 
 #include <cuda.h>
 
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index c57b370..ab65c92 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -8,15 +8,10 @@ FetchContent_Declare(
 
 FetchContent_MakeAvailable(Catch2)
 
-file(GLOB_RECURSE src_files "*.cpp")
+file(GLOB_RECURSE src_files "*.cpp" "*.cu")
 
 add_executable(tests${module_name} ${src_files})
 
-target_include_directories(tests${module_name}
-    PRIVATE
-        ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-)
-
 target_link_libraries(tests${module_name} PUBLIC ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp
index 40e8aa8..56e3ee1 100644
--- a/unit_tests/Test_TensorImpl.cpp
+++ b/unit_tests/Test_TensorImpl.cpp
@@ -13,11 +13,59 @@
 
 #include <catch2/catch_test_macros.hpp>
 
+#include "Test_cuda.hpp"
+
 #include "aidge/data/Tensor.hpp"
 #include "aidge/data/TensorImpl.hpp"
 
 using namespace Aidge;
 
+TEST_CASE("CUDA test") {
+  const int N = 100;
+
+  // Allocate host memory
+  float* a = new float[N]();
+  float* b = new float[N]();
+  float* out = new float[N]();
+
+  // Initialize host arrays
+  for(int i = 0; i < N; i++){
+    a[i] = 1.0f;
+    b[i] = 2.0f;
+  }
+
+  // Allocate device memory
+  float *d_a, *d_b, *d_out;
+  cudaMalloc(reinterpret_cast<void**>(&d_a), sizeof(float) * N);
+  cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(float) * N);
+  cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(float) * N);
+
+  // Transfer data from host to device memory
+  cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
+
+  // Executing kernel
+  vector_add(d_out, d_a, d_b, N);
+
+  // Transfer data back to host memory
+  cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
+
+  // Verification
+  for(int i = 0; i < N; i++){
+    REQUIRE(fabs(out[i] - a[i] - b[i]) < 1e-6);
+  }
+
+  // Deallocate device memory
+  cudaFree(d_a);
+  cudaFree(d_b);
+  cudaFree(d_out);
+
+  // Deallocate host memory
+  delete[] a;
+  delete[] b;
+  delete[] out;
+}
+
 TEST_CASE("Tensor creation", "[Connector]") {
   SECTION("from const array") {
     Tensor x;
diff --git a/unit_tests/Test_cuda.cu b/unit_tests/Test_cuda.cu
new file mode 100644
index 0000000..c46aee3
--- /dev/null
+++ b/unit_tests/Test_cuda.cu
@@ -0,0 +1,12 @@
+#include "Test_cuda.hpp"
+
+__global__ void vector_add_kernel(float *out, float *a, float *b, int n) {
+    for(int i = 0; i < n; i ++){
+        out[i] = a[i] + b[i];
+    }
+}
+
+void vector_add(float *out, float *a, float *b, int n) {
+    vector_add_kernel<<<1, 2>>>(out, a, b, n);
+    CHECK_CUDA_STATUS(cudaPeekAtLastError());
+}
diff --git a/unit_tests/Test_cuda.hpp b/unit_tests/Test_cuda.hpp
new file mode 100644
index 0000000..1d319f0
--- /dev/null
+++ b/unit_tests/Test_cuda.hpp
@@ -0,0 +1,6 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "aidge/utils/CudaUtils.hpp"
+
+void vector_add(float *out, float *a, float *b, int n);
-- 
GitLab