Commit 8c282297 authored by Houssem ROUIS

add random tests for AvgPooling
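
The new "Random Input" section draws random 4-D input shapes and values, checks the inferred output dimensions against floor((inDim - kernel) / stride) + 1, and compares the CUDA forward result element-wise against a naive CPU reference implementation of average pooling.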

parent 1263e73a
2 merge requests: !32 version 0.2.1, !25 Add backward implementations
********************************************************************************/
#include <array>
#include <catch2/catch_test_macros.hpp>
#include <chrono>      // std::chrono
#include <cmath>       // fabs, std::floor
#include <cstddef>     // std::size_t
#include <cstdint>     // std::uint16_t
#include <cuda_fp16.h> // half type
#include <functional>  // std::multiplies
#include <iostream>    // std::cout
#include <memory>      // std::shared_ptr, std::make_shared
#include <numeric>     // std::accumulate
#include <random>      // std::random_device, std::mt19937, std::uniform_real_distribution
#include <vector>      // std::vector
#include "Test_cuda.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cuda.hpp"
#include "aidge/data/half.hpp"
#include "aidge/data/Tensor.hpp"
using namespace Aidge;
TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
{
{
{{ 0, 1, 2, 3, 4},
{ 5, 6, 7, 8, 9},
{ 10, 11, 12, 13, 14},
{ 15, 16, 17, 18, 19},
{ 20, 21, 22, 23, 24}},
{{ 25, 26, 27, 28, 29},
{ 30, 31, 32, 33, 34},
{ 35, 36, 37, 38, 39},
{ 40, 41, 42, 43, 44},
{ 45, 46, 47, 48, 49}}
},
{
{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}
}
}
});
SECTION("Stride") {
std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]")
{
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{// NCHW
{
{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}},
{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}}},
{{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}}}});
SECTION("Stride")
{
std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
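// Expected output: each value is the mean of one 2x2 window of the input,
// e.g. the top-left window {0, 1, 5, 6} averages to 3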
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
{{{{3, 5},
{13, 15}},
{{28, 30},
{38, 40}}},
{{{103, 105},
{113, 115}},
{{128, 130},
{138, 140}}}}});
op->associateInput(0, myInput);
op->setDataType(DataType::Float32);
op->setBackend("cuda");
myAvgPool->forward();
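// Copy the GPU result back to host memory and compare it element-wise with the expected tensor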
float *computedOutput = new float[myOutput->size()]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
for (std::size_t i = 0; i < myOutput->size(); i++)
{
const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
delete[] computedOutput;
}
SECTION("Stride >= feature dim") {
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
{
{
{{0.3745, 0.9507, 0.7320},
{0.5987, 0.1560, 0.1560},
{0.0581, 0.8662, 0.6011}}
}
}
});
std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
{{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
});
op->associateInput(0,myInput2);
SECTION("Stride >= feature dim")
{
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW
{
{{{0.3745, 0.9507, 0.7320},
{0.5987, 0.1560, 0.1560},
{0.0581, 0.8662, 0.6011}}}}});
std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
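// A 3x3 kernel with stride 3 covers the whole 3x3 input, so the single
// output value is the mean of all nine inputs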
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{
{{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}});
op->associateInput(0, myInput2);
op->setDataType(DataType::Float32);
op->setBackend("cuda");
myAvgPool->forward();
float *computedOutput = new float[myOutput->size()]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
for (std::size_t i = 0; i < myOutput->size(); i++)
{
const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
delete[] computedOutput;
}
SECTION("half") {
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
{
{
{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
{half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
{half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
}
}
});
SECTION("half")
{
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{// NCHW
{
{{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
{half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
{half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}});
myInput2->setBackend("cuda");
std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
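// Expected output: the mean of the nine inputs, accumulated in half precision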
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{
{{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}});
op->associateInput(0, myInput2);
op->setDataType(DataType::Float16);
op->setBackend("cuda");
myAvgPool->forward();
half_float::half *computedOutput = new half_float::half[myOutput->size()]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
for (std::size_t i = 0; i < myOutput->size(); i++)
{
const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i);
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
delete[] computedOutput;
}
SECTION("Random Input")
{
constexpr std::uint16_t NBTRIALS = 5;
std::size_t kernel = 3;
std::size_t stride = 3;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(
0.1f, 1.1f); // random float values in [0.1, 1.1)
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(9),
std::size_t(12));
// To measure execution time of 'AvgPooling_Op::forward()'
std::chrono::time_point<std::chrono::system_clock> start;
std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{};
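// accumulated over all NBTRIALS forward passes of a section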
std::size_t number_of_operation = 0;
SECTION("OutDims")
{
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
{
// Create AveragePooling Operator
std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cuda");
// Create the input Tensor
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
op->associateInput(0, T0);
T0->setDataType(DataType::Float32);
T0->setBackend("cuda");
// generate a random Tensor
const std::size_t nbDims = 4;
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i)
{
dims.push_back(dimSizeDist(gen));
}
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
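// count input elements as a rough proxy for the work done in one forward pass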
// Fill input tensor
float *array0 = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i)
{
array0[i] = valueDist(gen);
}
float *input_d;
cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements);
cudaMemcpy(input_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
T0->resize(dims);
T0->getImpl()->setRawPtr(input_d, nb_elements);
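// Hand the device buffer to the tensor implementation; the tensor now reads directly from GPU memory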
// Run inference
start = std::chrono::system_clock::now();
myAvgPool->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// Verify output dimensions
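// For the spatial axes (H, W), AvgPooling without padding yields
// outDim = floor((inDim - kernel) / stride) + 1; batch and channel dims are unchanged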
REQUIRE(op->getOutput(0)->nbDims() == dims.size());
for (size_t i = 0; i < op->getOutput(0)->nbDims(); ++i)
{
if (i == 2 || i == 3)
REQUIRE(op->getOutput(0)->dims()[i] == (1 + static_cast<DimSize_t>(std::floor(static_cast<float>(dims[i] - kernel) / static_cast<float>(stride)))));
else
REQUIRE(op->getOutput(0)->dims()[i] == dims[i]);
}
delete[] array0;
cudaFree(input_d);
}
std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
std::cout << "total time: " << duration.count() << "μs" << std::endl;
}
SECTION("Values")
{
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
{
// Create AveragePooling Operator
std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cuda");
// Create the input Tensor
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
op->associateInput(0, T0);
T0->setDataType(DataType::Float32);
T0->setBackend("cuda");
// generate a random Tensor
const std::size_t nbDims = 4;
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i)
{
dims.push_back(dimSizeDist(gen));
}
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
// Fill the input with small whole-number values so the float reference
// average computed below is exact
float *array0 = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i)
{
array0[i] = static_cast<float>(static_cast<int>(valueDist(gen) * 10));
}
float *input_d;
cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements);
cudaMemcpy(input_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
T0->resize(dims);
T0->getImpl()->setRawPtr(input_d, nb_elements);
// Fill expected output
int N = dims[0]; // Batch
int C = dims[1]; // Channels
int H = dims[2]; // Height
int W = dims[3]; // Width
// Compute output dimensions
int outH = (H - kernel) / stride + 1;
int outW = (W - kernel) / stride + 1;
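// Integer division floors the quotient, matching the dimension check in the "OutDims"
// section; border rows/columns that do not fill a complete window are dropped (no padding)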
// Allocate memory for the output
std::size_t nbelemOut = N * C * outH * outW;
float *output = new float[nbelemOut];
for (int n = 0; n < N; ++n)
{
for (int c = 0; c < C; ++c)
{
for (int i = 0; i < outH; ++i)
{
for (int j = 0; j < outW; ++j)
{
float sum = 0.0;
for (std::size_t m = 0; m < kernel; ++m)
{
for (std::size_t k = 0; k < kernel; ++k)
{
sum += array0[((n * C + c) * H + i * stride + m) * W + j * stride + k];
}
}
output[((n * C + c) * outH + i) * outW + j] = sum / (kernel * kernel);
}
}
}
}
// Run inference
start = std::chrono::system_clock::now();
myAvgPool->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
float *computedOutput = new float[nbelemOut]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nbelemOut, cudaMemcpyDeviceToHost);
for (std::size_t i = 0; i < nbelemOut; ++i)
{
REQUIRE(fabs(computedOutput[i] - output[i]) < 1e-4);
}
delete[] computedOutput;
delete[] output;
delete[] array0;
cudaFree(input_d);
}
std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
std::cout << "total time: " << duration.count() << "μs" << std::endl;
}
}
}