Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
Test_AvgPoolingImpl.cpp 12.89 KiB
/********************************************************************************
 * Copyright (c) 2024 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include <array>
#include <chrono>    // timing of forward()
#include <cmath>     // std::fabs, std::floor
#include <cstddef>   // std::size_t
#include <cstdint>   // std::uint16_t
#include <iostream>  // std::cout
#include <memory>    // std::shared_ptr, std::make_shared
#include <numeric>   // std::accumulate
#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
#include <vector>

#include <catch2/catch_test_macros.hpp>
#include <cuda_fp16.h>

#include "Test_cuda.hpp"

#include "aidge/data/half.hpp"
#include "aidge/data/Tensor.hpp"

#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cuda.hpp"

using namespace Aidge;

// Forward-pass tests for the AvgPooling operator:
//  - fixed-value checks on CUDA (stride 2, stride >= feature dim, half precision),
//  - randomized checks of output dimensions and values against a naive CPU
//    reference implementation.
TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
    // Shared NCHW input (batch 2, 2 channels, 5x5 spatial) with easily
    // checkable running values.
    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
        {
            {
                {{  0,   1,   2,   3,   4},
                 {  5,   6,   7,   8,   9},
                 { 10,  11,  12,  13,  14},
                 { 15,  16,  17,  18,  19},
                 { 20,  21,  22,  23,  24}},

                {{ 25,  26,  27,  28,  29},
                 { 30,  31,  32,  33,  34},
                 { 35,  36,  37,  38,  39},
                 { 40,  41,  42,  43,  44},
                 { 45,  46,  47,  48,  49}}
            },
            {
                {{100, 101, 102, 103, 104},
                 {105, 106, 107, 108, 109},
                 {110, 111, 112, 113, 114},
                 {115, 116, 117, 118, 119},
                 {120, 121, 122, 123, 124}},

                {{125, 126, 127, 128, 129},
                 {130, 131, 132, 133, 134},
                 {135, 136, 137, 138, 139},
                 {140, 141, 142, 143, 144},
                 {145, 146, 147, 148, 149}}
            }
        }
    });
    SECTION("Stride") {
        // 2x2 kernel with stride 2: each output value is the mean of a
        // disjoint 2x2 window of the input.
        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());

        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
            {
                {
                    {{  3,   5},
                     { 13,  15}},
                    {{ 28,  30},
                     { 38,  40}}
                },
                {
                    {{103, 105},
                     {113, 115}},
                    {{128, 130},
                     {138, 140}}
                }
            }
        });
        op->associateInput(0, myInput);
        op->setDataType(DataType::Float32);
        op->setBackend("cuda");
        op->computeOutputDims();
        myAvgPool->forward();

        // Copy the device result back to the host before comparing.
        // std::vector (instead of new[]) avoids leaking the buffer if a
        // REQUIRE below throws.
        std::vector<float> computedOutput(myOutput->size());
        cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);

        for (std::size_t i = 0; i < myOutput->size(); ++i) {
            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
            REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6);
        }
    }

    SECTION("Stride >= feature dim") {
        // Kernel covering the whole 3x3 feature map: a single output value,
        // the mean of all nine inputs.
        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
        {
            {
                {{0.3745, 0.9507, 0.7320},
                 {0.5987, 0.1560, 0.1560},
                 {0.0581, 0.8662, 0.6011}}
            }
        }
        });
        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());

        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
        });
        op->associateInput(0, myInput2);
        op->setDataType(DataType::Float32);
        op->setBackend("cuda");
        op->computeOutputDims();
        myAvgPool->forward();

        std::vector<float> computedOutput(myOutput->size());
        cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);

        for (std::size_t i = 0; i < myOutput->size(); ++i) {
            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
            REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6);
        }
    }

    SECTION("half") {
        // Same full-coverage pooling as above, but in half precision. The
        // expected value is computed with the same half arithmetic so the
        // comparison stays tight.
        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
        {
            {
                {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
                 {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
                 {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
            }
        }
        });
        myInput2->setBackend("cuda");

        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
        });
        op->associateInput(0, myInput2);
        op->setDataType(DataType::Float16);
        op->setBackend("cuda");
        op->computeOutputDims();
        myAvgPool->forward();

        std::vector<half_float::half> computedOutput(myOutput->size());
        cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);

        for (std::size_t i = 0; i < myOutput->size(); ++i) {
            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
            // Compare in float: half -> float conversion is exact.
            REQUIRE(std::fabs(static_cast<float>(computedOutput[i]) - static_cast<float>(targetOutput)) < 1e-6);
        }
    }

    SECTION("Random Input") {
        constexpr std::uint16_t NBTRIALS = 10;
        std::size_t kernel = 2;
        std::size_t stride = 2;
        // Create a random number generator
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> valueDist(
            0.1f, 1.1f); // Random float distribution in [0.1, 1.1)
        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
                                                                std::size_t(10));
        // Always 4 dimensions (NCHW), kept as a distribution for symmetry
        // with the other randomized tests.
        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(4));

        // Create AveragePooling Operator (CPU backend: this section checks
        // the operator logic, not the CUDA kernel)
        std::shared_ptr<Node> myAvgPool = AvgPooling({kernel,kernel}, "myAvgPool", {stride,stride});
        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
        op->setDataType(DataType::Float32);
        op->setBackend("cpu");

        // Create the input Tensor
        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
        op->associateInput(0, T0);
        T0->setDataType(DataType::Float32);
        T0->setBackend("cpu");

        // To measure execution time of 'AveragePooling_Op::forward()'
        std::chrono::time_point<std::chrono::system_clock> start;
        std::chrono::time_point<std::chrono::system_clock> end;
        std::chrono::duration<double, std::micro> duration{};
        std::size_t number_of_operation = 0;

        SECTION("OutDims") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate a random Tensor
                const std::size_t nbDims = nbDimsDist(gen);
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }

                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;

                // Fill input tensor. The vector outlives forward(); the impl
                // only borrows the pointer (no ownership transfer).
                std::vector<float> array0(nb_elements);
                for (std::size_t i = 0; i < nb_elements; ++i) {
                    array0[i] = valueDist(gen);
                }
                T0->resize(dims);
                T0->getImpl()->setRawPtr(array0.data(), nb_elements);

                // Run inference
                op->computeOutputDims();
                start = std::chrono::system_clock::now();
                myAvgPool->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                // Verify output dimensions: spatial dims (H, W) follow the
                // usual pooling formula, N and C are unchanged.
                REQUIRE(op->getOutput(0)->nbDims() == dims.size());
                for (std::size_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
                    if (i == 2 || i == 3)
                        REQUIRE(op->getOutput(0)->dims()[i] == (1 + static_cast<DimSize_t>(std::floor(static_cast<float>(dims[i] - kernel) / static_cast<float>(stride)))));
                    else
                        REQUIRE(op->getOutput(0)->dims()[i] == dims[i]);
                }
            }
            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
            std::cout << "total time: " << duration.count() << "μs" << std::endl;
        }

        SECTION("Values") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate a random Tensor; every dim is fixed to 4 so the
                // 2x2/stride-2 windows tile the input exactly.
                const std::size_t nbDims = nbDimsDist(gen);
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(4);
                }

                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;

                // Fill input tensor (vector borrowed by the impl, see above)
                std::vector<float> array0(nb_elements);
                for (std::size_t i = 0; i < nb_elements; ++i) {
                    array0[i] = valueDist(gen);
                }
                T0->resize(dims);
                T0->getImpl()->setRawPtr(array0.data(), nb_elements);

                // Compute the expected output with a naive reference
                // implementation: mean over each kernel window, windows
                // restricted to those fully inside the feature map.
                std::vector<float> result;
                std::size_t rows = dims[2], cols = dims[3], nbMat = dims[0] * dims[1], matSize = rows*cols;
                for (std::size_t i = 0; i < nbMat; i++) {
                    for (std::size_t r = 0; r + kernel <= rows; r += stride) {
                        for (std::size_t c = 0; c + kernel <= cols; c += stride) {
                            float sum = 0.0f;
                            for (std::size_t m = 0; m < kernel; m++) {
                                for (std::size_t n = 0; n < kernel; n++) {
                                    sum += array0[i * matSize + (r + m) * cols + c + n];
                                }
                            }
                            result.push_back(sum / (kernel * kernel));
                        }
                    }
                }

                // Run inference
                op->computeOutputDims();
                start = std::chrono::system_clock::now();
                myAvgPool->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                // Compare element-wise against the reference. std::fabs is
                // required here: unqualified abs() would resolve to the
                // integer overload and truncate the difference.
                float* computedOutput = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
                for (std::size_t i = 0; i < op->getOutput(0)->size(); ++i) {
                    REQUIRE(std::fabs(computedOutput[i] - result[i]) < 1e-6);
                }
            }
            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
            std::cout << "total time: " << duration.count() << "μs" << std::endl;
        }
    }
}