diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp
index dfadebbe07aa38371576cf4006773484494751a0..e453d9ee866c50c23b34a445e0c0eedf1fba0b07 100644
--- a/unit_tests/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/Test_AvgPoolingImpl.cpp
@@ -10,151 +10,323 @@
  ********************************************************************************/
 
 #include <array>
-
 #include <catch2/catch_test_macros.hpp>
-#include <cuda_fp16.h>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-
-#include "Test_cuda.hpp"
-
-#include "aidge/data/half.hpp"
-#include "aidge/data/Tensor.hpp"
+#include <chrono>      // std::chrono::steady_clock
+#include <cmath>       // std::fabs
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <cuda_fp16.h> // half type
+#include <functional>  // std::multiplies
+#include <iostream>    // std::cout
+#include <memory>      // std::shared_ptr
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <vector>      // std::vector
 
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/data/Tensor.hpp"
 
 using namespace Aidge;
 
-TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
-        {
-            {
-                {{ 0, 1, 2, 3, 4},
-                 { 5, 6, 7, 8, 9},
-                 { 10, 11, 12, 13, 14},
-                 { 15, 16, 17, 18, 19},
-                 { 20, 21, 22, 23, 24}},
-
-                {{ 25, 26, 27, 28, 29},
-                 { 30, 31, 32, 33, 34},
-                 { 35, 36, 37, 38, 39},
-                 { 40, 41, 42, 43, 44},
-                 { 45, 46, 47, 48, 49}}
-            },
-            {
-                {{100, 101, 102, 103, 104},
-                 {105, 106, 107, 108, 109},
-                 {110, 111, 112, 113, 114},
-                 {115, 116, 117, 118, 119},
-                 {120, 121, 122, 123, 124}},
-
-                {{125, 126, 127, 128, 129},
-                 {130, 131, 132, 133, 134},
-                 {135, 136, 137, 138, 139},
-                 {140, 141, 142, 143, 144},
-                 {145, 146, 147, 148, 149}}
-            }
-        }
-    });
-    SECTION("Stride") {
-        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]")
+{
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{// NCHW
+        {
+            {{{0, 1, 2, 3, 4},
+              {5, 6, 7, 8, 9},
+              {10, 11, 12, 13, 14},
+              {15, 16, 17, 18, 19},
+              {20, 21, 22, 23, 24}},
 
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
-            {
-                {
-                    {{ 3, 5},
-                     { 13, 15}},
-                    {{ 28, 30},
-                     { 38, 40}}
-                },
-                {
-                    {{103, 105},
-                     {113, 115}},
-                    {{128, 130},
-                     {138, 140}}
-                }
-            }
-        });
-        op->associateInput(0,myInput);
+             {{25, 26, 27, 28, 29},
+              {30, 31, 32, 33, 34},
+              {35, 36, 37, 38, 39},
+              {40, 41, 42, 43, 44},
+              {45, 46, 47, 48, 49}}},
+            {{{100, 101, 102, 103, 104},
+              {105, 106, 107, 108, 109},
+              {110, 111, 112, 113, 114},
+              {115, 116, 117, 118, 119},
+              {120, 121, 122, 123, 124}},
+
+             {{125, 126, 127, 128, 129},
+              {130, 131, 132, 133, 134},
+              {135, 136, 137, 138, 139},
+              {140, 141, 142, 143, 144},
+              {145, 146, 147, 148, 149}}}}});
+    SECTION("Stride")
+    {
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {{{{3, 5},
+               {13, 15}},
+              {{28, 30},
+               {38, 40}}},
+             {{{103, 105},
+               {113, 115}},
+              {{128, 130},
+               {138, 140}}}}});
+        op->associateInput(0, myInput);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("Stride >= feature dim") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
-            {
-                {
-                    {{0.3745, 0.9507, 0.7320},
-                     {0.5987, 0.1560, 0.1560},
-                     {0.0581, 0.8662, 0.6011}}
-                }
-            }
-        });
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
-            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
-        });
-        op->associateInput(0,myInput2);
+    SECTION("Stride >= feature dim")
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW
+            {
+                {{{0.3745, 0.9507, 0.7320},
+                  {0.5987, 0.1560, 0.1560},
+                  {0.0581, 0.8662, 0.6011}}}}});
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{
+            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("half") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
-            {
-                {
-                    {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
-                     {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
-                     {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
-                }
-            }
-        });
+    SECTION("half")
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{// NCHW
+            {
+                {{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
+                  {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
+                  {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}});
         myInput2->setBackend("cuda");
 
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
-            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
-        });
-        op->associateInput(0,myInput2);
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{
+            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float16);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        half_float::half* computedOutput = new half_float::half[myOutput->size()]();
+        half_float::half *computedOutput = new half_float::half[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
-}
\ No newline at end of file
+
+    // Randomised trials: each one builds a 4-D input whose extents are drawn
+    // between 9 and 12, runs a 3x3 / stride-3 AvgPooling on the CUDA backend and
+    // checks the operator output against expectations derived on the host.
+    SECTION("Random Input")
+    {
+        constexpr std::uint16_t NBTRIALS = 5;
+        const std::size_t kernel = 3;
+        const std::size_t stride = 3;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(9),
+                                                               std::size_t(12));
+
+        // To measure execution time of 'AveragePooling_Op::forward()'.
+        // steady_clock is monotonic, unlike system_clock, so intervals stay valid.
+        // NOTE(review): forward() may return before the GPU work has finished;
+        // confirm whether a device synchronisation is needed for these timings.
+        std::chrono::time_point<std::chrono::steady_clock> start;
+        std::chrono::time_point<std::chrono::steady_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+
+        SECTION("OutDims")
+        {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+            {
+                // Create AveragePooling Operator
+                std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
+                auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+                op->setDataType(DataType::Float32);
+                op->setBackend("cuda");
+
+                // Create the input Tensor
+                std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+                op->associateInput(0, T0);
+                T0->setDataType(DataType::Float32);
+                T0->setBackend("cuda");
+
+                // generate a random Tensor
+                const std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dims.push_back(dimSizeDist(gen));
+                }
+
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // Fill input tensor (std::vector releases the host buffer even if a REQUIRE aborts the trial)
+                std::vector<float> array0(nb_elements);
+                for (std::size_t i = 0; i < nb_elements; ++i)
+                {
+                    array0[i] = valueDist(gen);
+                }
+                float *input_d = nullptr;
+                REQUIRE(cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements) == cudaSuccess);
+                REQUIRE(cudaMemcpy(input_d, array0.data(), sizeof(float) * nb_elements, cudaMemcpyHostToDevice) == cudaSuccess);
+                T0->resize(dims);
+                T0->getImpl()->setRawPtr(input_d, nb_elements);
+
+                // Run inference
+                start = std::chrono::steady_clock::now();
+                myAvgPool->forward();
+                end = std::chrono::steady_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // Verify output dimensions
+                REQUIRE(op->getOutput(0)->nbDims() == dims.size());
+                for (std::size_t i = 0; i < op->getOutput(0)->nbDims(); ++i)
+                {
+                    // Axes 2 and 3 are pooled; the other axes must keep their extent
+                    const DimSize_t expected = (i == 2 || i == 3) ? static_cast<DimSize_t>((dims[i] - kernel) / stride + 1) : static_cast<DimSize_t>(dims[i]);
+                    REQUIRE(op->getOutput(0)->dims()[i] == expected);
+                }
+
+                cudaFree(input_d);
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+        SECTION("Values")
+        {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+            {
+                // Create AveragePooling Operator
+                std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
+                auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+                op->setDataType(DataType::Float32);
+                op->setBackend("cuda");
+
+                // Create the input Tensor
+                std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+                op->associateInput(0, T0);
+                T0->setDataType(DataType::Float32);
+                T0->setBackend("cuda");
+
+                // generate a random Tensor
+                const std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dims.push_back(dimSizeDist(gen));
+                }
+
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // Fill input tensor
+                std::vector<float> array0(nb_elements);
+                for (std::size_t i = 0; i < nb_elements; ++i)
+                {
+                    // Truncate to a whole number so the tensor holds integer-valued floats
+                    array0[i] = static_cast<float>(static_cast<int>(valueDist(gen) * 10));
+                }
+                float *input_d = nullptr;
+                REQUIRE(cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements) == cudaSuccess);
+                REQUIRE(cudaMemcpy(input_d, array0.data(), sizeof(float) * nb_elements, cudaMemcpyHostToDevice) == cudaSuccess);
+                T0->resize(dims);
+                T0->getImpl()->setRawPtr(input_d, nb_elements);
+
+                // Fill expected output
+                const std::size_t N = dims[0]; // Batch
+                const std::size_t C = dims[1]; // Channels
+                const std::size_t H = dims[2]; // Height
+                const std::size_t W = dims[3]; // Width
+
+                // Compute output dimensions
+                const std::size_t outH = (H - kernel) / stride + 1;
+                const std::size_t outW = (W - kernel) / stride + 1;
+
+                // Allocate memory for the output
+                const std::size_t nbelemOut = N * C * outH * outW;
+                std::vector<float> output(nbelemOut);
+
+                for (std::size_t n = 0; n < N; ++n)
+                {
+                    for (std::size_t c = 0; c < C; ++c)
+                    {
+                        for (std::size_t i = 0; i < outH; ++i)
+                        {
+                            for (std::size_t j = 0; j < outW; ++j)
+                            {
+                                float sum = 0.0f;
+                                for (std::size_t m = 0; m < kernel; ++m)
+                                {
+                                    for (std::size_t k = 0; k < kernel; ++k)
+                                    {
+                                        sum += array0[((n * C + c) * H + i * stride + m) * W + j * stride + k];
+                                    }
+                                }
+                                output[((n * C + c) * outH + i) * outW + j] = sum / static_cast<float>(kernel * kernel);
+                            }
+                        }
+                    }
+                }
+
+                // Run inference
+                start = std::chrono::steady_clock::now();
+                myAvgPool->forward();
+                end = std::chrono::steady_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // The copy below reads nbelemOut floats from the operator output, so check its size first
+                REQUIRE(op->getOutput(0)->size() == nbelemOut);
+                std::vector<float> computedOutput(nbelemOut);
+                REQUIRE(cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nbelemOut, cudaMemcpyDeviceToHost) == cudaSuccess);
+
+                for (std::size_t i = 0; i < nbelemOut; ++i)
+                {
+                    REQUIRE(std::fabs(computedOutput[i] - output[i]) < 1e-4);
+                }
+
+                cudaFree(input_d);
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+    }
+}