/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include <array>
#include <cmath>

#include <catch2/catch_test_macros.hpp>

#include "Test_cuda.hpp"

#include "aidge/data/Tensor.hpp"

#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cuda.hpp"

using namespace Aidge;

TEST_CASE("CUDA test") {
    const int N = 100;

    // Allocate host memory
    float* a = new float[N]();
    float* b = new float[N]();
    float* out = new float[N]();

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    float *d_a, *d_b, *d_out;
    cudaMalloc(reinterpret_cast<void**>(&d_a), sizeof(float) * N);
    cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(float) * N);
    cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(float) * N);

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Execute the kernel
    vector_add(d_out, d_a, d_b, N);

    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Verification
    for (int i = 0; i < N; i++) {
        REQUIRE(std::fabs(out[i] - a[i] - b[i]) < 1e-6);
    }

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);

    // Deallocate host memory
    delete[] a;
    delete[] b;
    delete[] out;
}

TEST_CASE("Tensor creation", "[Connector]") {
    SECTION("from const array") {
        Tensor x;
        x.setBackend("cuda");
        x = Array3D<int, 2, 2, 2>{
            {
                {
                    {1, 2},
                    {3, 4}
                },
                {
                    {5, 6},
                    {7, 8}
                }
            }};

        REQUIRE(x.nbDims() == 3);
        REQUIRE(x.dims()[0] == 2);
        REQUIRE(x.dims()[1] == 2);
        REQUIRE(x.dims()[2] == 2);
        REQUIRE(x.size() == 8);

        // Copy the device buffer back to the host to check its contents
        std::array<int, 8> val;
        cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost);
        REQUIRE(val[0] == 1);
        REQUIRE(val[7] == 8);
    }

    SECTION("from const array before backend") {
        Tensor x = Array3D<int, 2, 2, 2>{
            {
                {
                    {1, 2},
                    {3, 4}
                },
                {
                    {5, 6},
                    {7, 8}
                }
            }};
        x.setBackend("cuda");

        REQUIRE(x.nbDims() == 3);
        REQUIRE(x.dims()[0] == 2);
        REQUIRE(x.dims()[1] == 2);
        REQUIRE(x.dims()[2] == 2);
        REQUIRE(x.size() == 8);

        // Copy the device buffer back to the host to check its contents
        std::array<int, 8> val;
        cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost);
        REQUIRE(val[0] == 1);
        REQUIRE(val[7] == 8);
    }
}
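
// -----------------------------------------------------------------------------
// Illustrative sketch only (not part of the test suite): the vector_add helper
// called in TEST_CASE("CUDA test") is presumably declared in Test_cuda.hpp and
// defined in a separate .cu file; its actual implementation may differ. A
// minimal element-wise addition kernel and launcher could look like this:
//
//     __global__ void vector_add_kernel(float* out, const float* a,
//                                       const float* b, int n) {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element
//         if (i < n) {
//             out[i] = a[i] + b[i];
//         }
//     }
//
//     void vector_add(float* out, const float* a, const float* b, int n) {
//         const int threadsPerBlock = 256;
//         const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
//         vector_add_kernel<<<blocks, threadsPerBlock>>>(out, a, b, n);
//         cudaDeviceSynchronize();  // ensure the kernel finished before the
//                                   // test copies results back to the host
//     }
// -----------------------------------------------------------------------------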