Skip to content
Snippets Groups Projects
Commit 509b4a66 authored by Houssem ROUIS's avatar Houssem ROUIS
Browse files

add random test for BatchNorm

parent e6e054ed
No related branches found
No related tags found
2 merge requests!32version 0.2.1,!25Add backward implementations
/********************************************************************************
* Copyright (c) 2023 CEA-List
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0
 *
 * SPDX-License-Identifier: EPL-2.0
 *
********************************************************************************/
#include <algorithm>  // std::fill
#include <array>
#include <chrono>     // std::chrono::system_clock, timing of forward()
#include <cmath>      // std::fabs, std::sqrt
#include <cstddef>    // std::size_t
#include <cstdint>    // std::uint16_t
#include <iostream>   // std::cout
#include <memory>     // std::shared_ptr, std::make_shared
#include <numeric>    // std::accumulate
#include <random>     // std::random_device, std::mt19937, std::uniform_real_distribution
#include <vector>     // std::vector

#include <catch2/catch_test_macros.hpp>

#include "Test_cuda.hpp"

#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cuda.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/BatchNorm.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
// Checks the CUDA BatchNorm2D forward kernel against
// (a) pre-computed reference values on a fixed 2x3x3x3 input, and
// (b) a host-side re-implementation of the inference formula
//     y = bias + weight * (x - mean) / sqrt(var + eps)
// on randomly shaped, randomly filled inputs.
TEST_CASE("[gpu/operator] BatchNorm(forward)") {
    SECTION("Static Input") {
        std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
        auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm->getOperator());
        op->setDataType(DataType::Float32);
        op->setBackend("cuda");
        // Per-channel scale, shift and running statistics (3 channels).
        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9159252643585205, 0.18772238492965698, 0.4479946792125702}});
        std::shared_ptr<Tensor> myBias    = std::make_shared<Tensor>(Array1D<float,3> {{0.33898890018463135, 0.3167555630207062, 0.7047033309936523}});
        std::shared_ptr<Tensor> myMean    = std::make_shared<Tensor>(Array1D<float,3> {{0.45547693967819214, 0.22650663554668427, 0.6612948179244995}});
        std::shared_ptr<Tensor> myVar     = std::make_shared<Tensor>(Array1D<float,3> {{0.02570258639752865, 0.026536229997873306, 0.15111008286476135}});
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW
            {
                {
                    {{0.12943482, 0.6451229 , 0.24979436},
                     {0.7551012 , 0.32007095, 0.89463896},
                     {0.7087448 , 0.6266124 , 0.4782957 }},
                    {{0.13796203, 0.9950787 , 0.71555305},
                     {0.01347321, 0.4395316 , 0.43097174},
                     {0.6056306 , 0.9561122 , 0.5783939 }},
                    {{0.7174486 , 0.503465  , 0.23695093},
                     {0.5145477 , 0.39576462, 0.02779444},
                     {0.60789394, 0.14119725, 0.20753163}}
                },
                {
                    {{0.74452287, 0.5354875 , 0.8148496 },
                     {0.73356223, 0.4304034 , 0.11783765},
                     {0.8966221 , 0.41049036, 0.95982736}},
                    {{0.03161403, 0.71250844, 0.14337301},
                     {0.5338889 , 0.13484782, 0.8055851 },
                     {0.71784616, 0.8349626 , 0.10107189}},
                    {{0.85701346, 0.58286697, 0.9836816 },
                     {0.36061534, 0.03660944, 0.7375317 },
                     {0.6977233 , 0.51965624, 0.29440993}}
                }
            }
        });
        // Expected values: y = bias + weight * (x - mean) / sqrt(var + eps), per channel.
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
            {
                {
                    {{-1.5233592 ,  1.4222438 , -0.83586717},
                     { 2.0504384 , -0.43444824,  2.847476  },
                     { 1.7856512 ,  1.3165123 ,  0.46932936}},
                    {{ 0.21473758,  1.2022772 ,  0.8802177 },
                     { 0.07130594,  0.5621954 ,  0.55233306},
                     { 0.7535689 ,  1.1573814 ,  0.72218764}},
                    {{ 0.7694162 ,  0.52281666,  0.2156798 },
                     { 0.5355886 ,  0.3987003 , -0.02535689},
                     { 0.6431629 ,  0.10533108,  0.18177633}}
                },
                {
                    {{ 1.990015  ,  0.7960079 ,  2.3917203 },
                     { 1.9274082 ,  0.19576907, -1.5896021 },
                     { 2.8588037 ,  0.08202624,  3.2198315 }},
                    {{ 0.09220716,  0.8767097 ,  0.22097193},
                     { 0.6709106 ,  0.2111495 ,  0.9839494 },
                     { 0.8828597 ,  1.0177971 ,  0.17223406}},
                    {{ 0.9302539 ,  0.6143213 ,  1.0762292 },
                     { 0.35819346, -0.01519828,  0.79256046},
                     { 0.7466844 ,  0.5414758 ,  0.28189686}}
                }
            }
        });
        myInput->setBackend("cuda");
        myWeights->setBackend("cuda");
        myBias->setBackend("cuda");
        myMean->setBackend("cuda");
        myVar->setBackend("cuda");
        op->associateInput(0, myInput);
        op->associateInput(1, myWeights);
        op->associateInput(2, myBias);
        op->associateInput(3, myMean);
        op->associateInput(4, myVar);
        op->forward();

        // Copy the device result back to the host and compare element-wise.
        // std::vector (instead of new[]) keeps the buffer alive through a
        // throwing REQUIRE and releases it automatically.
        std::vector<float> computedOutput(myOutput->size());
        cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(),
                   sizeof(float) * computedOutput.size(), cudaMemcpyDeviceToHost);
        for (std::size_t i = 0; i < computedOutput.size(); ++i) {
            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
            REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-5);
        }
    }

    SECTION("Random Input") {
        constexpr std::uint16_t NBTRIALS = 10;
        constexpr float epsilon  = 0.00001F;
        constexpr float momentum = 0.1F;

        // Random generators for tensor values and tensor dimensions.
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10));

        // To measure execution time of 'forward()'
        std::chrono::time_point<std::chrono::system_clock> start;
        std::chrono::time_point<std::chrono::system_clock> end;
        std::chrono::duration<double, std::micro> duration{};
        std::size_t number_of_operation = 0;

        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
            // Generate a random NCHW shape.
            constexpr std::size_t nbDims = 4;
            std::vector<std::size_t> dims;
            for (std::size_t i = 0; i < nbDims; ++i) {
                dims.push_back(dimSizeDist(gen));
            }
            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(),
                                                            std::size_t(1), std::multiplies<std::size_t>());
            number_of_operation += nb_elements;
            const std::size_t N = dims[0]; // Batch
            const std::size_t C = dims[1]; // Channels
            const std::size_t H = dims[2]; // Height
            const std::size_t W = dims[3]; // Width

            // Create BatchNorm operator.
            std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(static_cast<DimSize_t>(C), epsilon, momentum, "mybatchnorm");
            auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm->getOperator());
            op->setDataType(DataType::Float32);
            op->setBackend("cuda");

            // Input tensor (values filled and uploaded below).
            std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
            T0->setDataType(DataType::Float32);
            T0->setBackend("cuda");
            T0->resize(dims);
            op->associateInput(0, T0);
            // Per-channel parameter tensors: weight, bias, running mean, running variance.
            std::shared_ptr<Tensor> Tw = std::make_shared<Tensor>();
            Tw->setDataType(DataType::Float32);
            Tw->setBackend("cuda");
            Tw->resize({static_cast<DimSize_t>(C)});
            op->associateInput(1, Tw);
            std::shared_ptr<Tensor> Tb = std::make_shared<Tensor>();
            Tb->setDataType(DataType::Float32);
            Tb->setBackend("cuda");
            Tb->resize({static_cast<DimSize_t>(C)});
            op->associateInput(2, Tb);
            std::shared_ptr<Tensor> Tm = std::make_shared<Tensor>();
            Tm->setDataType(DataType::Float32);
            Tm->setBackend("cuda");
            Tm->resize({static_cast<DimSize_t>(C)});
            op->associateInput(3, Tm);
            std::shared_ptr<Tensor> Tv = std::make_shared<Tensor>();
            Tv->setDataType(DataType::Float32);
            Tv->setBackend("cuda");
            Tv->resize({static_cast<DimSize_t>(C)});
            op->associateInput(4, Tv);

            // Fill the input tensor with random values and upload it to the device.
            std::vector<float> input_h(nb_elements);
            for (float& v : input_h) {
                v = valueDist(gen);
            }
            float* input_d = nullptr;
            cudaMalloc(reinterpret_cast<void**>(&input_d), sizeof(float) * nb_elements);
            cudaMemcpy(input_d, input_h.data(), sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
            T0->getImpl()->setRawPtr(input_d, nb_elements);

            // Random per-channel parameters, uploaded likewise.
            std::vector<float> weight_h(C), bias_h(C), mean_h(C), var_h(C);
            for (std::size_t i = 0; i < C; ++i) {
                weight_h[i] = valueDist(gen);
                bias_h[i]   = valueDist(gen);
                mean_h[i]   = valueDist(gen);
                var_h[i]    = valueDist(gen);
            }
            float *weight_d = nullptr, *bias_d = nullptr, *mean_d = nullptr, *var_d = nullptr;
            cudaMalloc(reinterpret_cast<void**>(&weight_d), sizeof(float) * C);
            cudaMemcpy(weight_d, weight_h.data(), sizeof(float) * C, cudaMemcpyHostToDevice);
            Tw->getImpl()->setRawPtr(weight_d, C);
            cudaMalloc(reinterpret_cast<void**>(&bias_d), sizeof(float) * C);
            cudaMemcpy(bias_d, bias_h.data(), sizeof(float) * C, cudaMemcpyHostToDevice);
            Tb->getImpl()->setRawPtr(bias_d, C);
            cudaMalloc(reinterpret_cast<void**>(&mean_d), sizeof(float) * C);
            cudaMemcpy(mean_d, mean_h.data(), sizeof(float) * C, cudaMemcpyHostToDevice);
            Tm->getImpl()->setRawPtr(mean_d, C);
            cudaMalloc(reinterpret_cast<void**>(&var_d), sizeof(float) * C);
            cudaMemcpy(var_d, var_h.data(), sizeof(float) * C, cudaMemcpyHostToDevice);
            Tv->getImpl()->setRawPtr(var_d, C);

            // Host-side reference: y = bias + weight * (x - mean) / sqrt(var + eps).
            std::vector<float> output_h(nb_elements);
            const std::size_t featureMapSize = W * H;
            for (std::size_t batch = 0; batch < N; ++batch) {
                for (std::size_t ch = 0; ch < C; ++ch) {
                    const std::size_t ioIndex = (ch + batch * C) * featureMapSize;
                    std::fill(output_h.begin() + ioIndex, output_h.begin() + ioIndex + featureMapSize, bias_h[ch]);
                    const float var = std::sqrt(var_h[ch] + epsilon);
                    for (std::size_t feature = 0; feature < featureMapSize; ++feature) {
                        output_h[ioIndex + feature] += weight_h[ch] * (input_h[ioIndex + feature] - mean_h[ch]) / var;
                    }
                }
            }

            // Run inference and accumulate timing.
            start = std::chrono::system_clock::now();
            myBatchNorm->forward();
            end = std::chrono::system_clock::now();
            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

            // Compare EVERY element against the reference (the previous code
            // dereferenced the buffers and only checked element 0).
            std::vector<float> computedOutput(nb_elements);
            cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(),
                       sizeof(float) * nb_elements, cudaMemcpyDeviceToHost);
            for (std::size_t i = 0; i < nb_elements; ++i) {
                REQUIRE(std::fabs(computedOutput[i] - output_h[i]) < 1e-5);
            }

            cudaFree(input_d);
            cudaFree(weight_d);
            cudaFree(bias_d);
            cudaFree(mean_d);
            cudaFree(var_d);
        }
        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
        std::cout << "total time: " << duration.count() << "μs" << std::endl;
    }
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment