diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp
index dfadebbe07aa38371576cf4006773484494751a0..e453d9ee866c50c23b34a445e0c0eedf1fba0b07 100644
--- a/unit_tests/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/Test_AvgPoolingImpl.cpp
@@ -10,151 +10,308 @@
  ********************************************************************************/
 
 #include <array>
-
 #include <catch2/catch_test_macros.hpp>
-#include <cuda_fp16.h>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-
-#include "Test_cuda.hpp"
-
-#include "aidge/data/half.hpp"
-#include "aidge/data/Tensor.hpp"
+#include <cuda_fp16.h> // half type
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/data/Tensor.hpp"
 
 using namespace Aidge;
 
-TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
-        {
-            {
-                {{  0,   1,   2,   3,   4},
-                 {  5,   6,   7,   8,   9},
-                 { 10,  11,  12,  13,  14},
-                 { 15,  16,  17,  18,  19},
-                 { 20,  21,  22,  23,  24}},
-
-                {{ 25,  26,  27,  28,  29},
-                 { 30,  31,  32,  33,  34},
-                 { 35,  36,  37,  38,  39},
-                 { 40,  41,  42,  43,  44},
-                 { 45,  46,  47,  48,  49}}
-            },
-            {
-                {{100, 101, 102, 103, 104},
-                 {105, 106, 107, 108, 109},
-                 {110, 111, 112, 113, 114},
-                 {115, 116, 117, 118, 119},
-                 {120, 121, 122, 123, 124}},
-
-                {{125, 126, 127, 128, 129},
-                 {130, 131, 132, 133, 134},
-                 {135, 136, 137, 138, 139},
-                 {140, 141, 142, 143, 144},
-                 {145, 146, 147, 148, 149}}
-            }
-        }
-    });
-    SECTION("Stride") {
-        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]")
+{
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{ // NCHW: 2 batches x 2 channels of 5x5 maps, consumed by the "Stride" section
+                                                                                          {
+                                                                                              {{{0, 1, 2, 3, 4},
+                                                                                                {5, 6, 7, 8, 9},
+                                                                                                {10, 11, 12, 13, 14},
+                                                                                                {15, 16, 17, 18, 19},
+                                                                                                {20, 21, 22, 23, 24}},
 
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
-            {
-                {
-                    {{  3,   5},
-                     { 13,  15}},
-                    {{ 28,  30},
-                     { 38,  40}}
-                },
-                {
-                    {{103, 105},
-                     {113, 115}},
-                    {{128, 130},
-                     {138, 140}}
-                }
-            }
-        });
-        op->associateInput(0,myInput);
+                                                                                               {{25, 26, 27, 28, 29},
+                                                                                                {30, 31, 32, 33, 34},
+                                                                                                {35, 36, 37, 38, 39},
+                                                                                                {40, 41, 42, 43, 44},
+                                                                                                {45, 46, 47, 48, 49}}},
+                                                                                              {{{100, 101, 102, 103, 104},
+                                                                                                {105, 106, 107, 108, 109},
+                                                                                                {110, 111, 112, 113, 114},
+                                                                                                {115, 116, 117, 118, 119},
+                                                                                                {120, 121, 122, 123, 124}},
+
+                                                                                               {{125, 126, 127, 128, 129},
+                                                                                                {130, 131, 132, 133, 134},
+                                                                                                {135, 136, 137, 138, 139},
+                                                                                                {140, 141, 142, 143, 144},
+                                                                                                {145, 146, 147, 148, 149}}}}});
+    SECTION("Stride") // 2x2 pooling window with stride 2 applied to the 5x5 maps of myInput
+    {
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ // expected window means, e.g. 3 = (0 + 1 + 5 + 6) / 4; the 5th row/column is covered by no window
+            {{{{3, 5},
+               {13, 15}},
+              {{28, 30},
+               {38, 40}}},
+             {{{103, 105},
+               {113, 115}},
+              {{128, 130},
+               {138, 140}}}}});
+        op->associateInput(0, myInput);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput   = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()](); // zero-initialised host buffer that receives the device result
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++) // element-wise comparison against myOutput
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("Stride >= feature dim") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
-        {
-            {
-                {{0.3745, 0.9507, 0.7320},
-                 {0.5987, 0.1560, 0.1560},
-                 {0.0581, 0.8662, 0.6011}}
-            }
-        }
-        });
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
-            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
-        });
-        op->associateInput(0,myInput2);
+    SECTION("Stride >= feature dim") // one 3x3 window (kernel 3, stride 3) spanning the whole 3x3 input
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{ // NCHW: a single 3x3 map
+                                                                                               {
+                                                                                                   {{{0.3745, 0.9507, 0.7320},
+                                                                                                     {0.5987, 0.1560, 0.1560},
+                                                                                                     {0.0581, 0.8662, 0.6011}}}}});
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ // expected value: mean of the nine input values
+            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput   = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()](); // zero-initialised host buffer that receives the device result
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++) // element-wise comparison against myOutput
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("half") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
-        {
-            {
-                {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
-                 {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
-                 {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
-            }
-        }
-        });
+    SECTION("half") // same single-window check as above, run with Float16 data
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{ // NCHW: a single 3x3 map of half values
+                                                                                                          {
+                                                                                                              {{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
+                                                                                                                {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
+                                                                                                                {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}});
         myInput2->setBackend("cuda");
 
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
-            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
-        });
-        op->associateInput(0,myInput2);
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ // expected value: mean of the nine inputs, evaluated with half_float::half operands
+            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float16);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        half_float::half* computedOutput   = new half_float::half[myOutput->size()]();
+        half_float::half *computedOutput = new half_float::half[myOutput->size()](); // host buffer that receives the device result
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++) // NOTE(review): the 1e-6 bound below is far finer than half precision, so it presumably demands an exact match - confirm this is intended
+        {
+            const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
+    // Randomised checks on random 4D inputs; see the nested "OutDims" and "Values" sections.
+    SECTION("Random Input")
+    {
+        constexpr std::uint16_t NBTRIALS = 5;
+        // Square pooling window size and stride used by every trial.
+        std::size_t kernel = 3;
+        std::size_t stride = 3;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random floats drawn from [0.1, 1.1)
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(9), std::size_t(12)); // Each dimension drawn from [9, 12]
+
+        // To measure execution time of 'AveragePooling_Op::forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        // Running total of input elements fed to forward(); printed together with the timings.
+        std::size_t number_of_operation = 0;
+
+        SECTION("OutDims")
+        {
+            // Only shapes are checked here: N and C must be preserved, H and W must be pooled.
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+            {
+                // Create AveragePooling Operator
+                std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
+                auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+                op->setDataType(DataType::Float32);
+                op->setBackend("cuda");
+
+                // Create the input Tensor
+                std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+                op->associateInput(0, T0);
+                T0->setDataType(DataType::Float32);
+                T0->setBackend("cuda");
+
+                // generate a random Tensor shape
+                const std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dims.push_back(dimSizeDist(gen));
+                }
+
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // Fill input tensor; the host buffer is a vector so it is released even if a REQUIRE aborts the section
+                std::vector<float> array0(nb_elements);
+                for (float &value : array0)
+                {
+                    value = valueDist(gen);
+                }
+                float *input_d;
+                cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements);
+                cudaMemcpy(input_d, array0.data(), sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+                T0->resize(dims);
+                T0->getImpl()->setRawPtr(input_d, nb_elements);
+
+                // Run inference
+                start = std::chrono::system_clock::now();
+                myAvgPool->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // Verify output dimensions
+                REQUIRE(op->getOutput(0)->nbDims() == dims.size());
+                for (std::size_t i = 0; i < op->getOutput(0)->nbDims(); ++i)
+                {
+                    // Axes 2 and 3 (H, W) are pooled: (dim - kernel) / stride + 1 in integer arithmetic, as in "Values"
+                    const std::size_t expectedDim = (i == 2 || i == 3) ? (dims[i] - kernel) / stride + 1 : dims[i];
+                    REQUIRE(op->getOutput(0)->dims()[i] == expectedDim);
+                }
+
+                // Release the device buffer allocated for this trial
+                cudaFree(input_d);
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+        SECTION("Values")
+        {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+            {
+                // Create AveragePooling Operator
+                std::shared_ptr<Node> myAvgPool = AvgPooling({kernel, kernel}, "myAvgPool", {stride, stride});
+                auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+                op->setDataType(DataType::Float32);
+                op->setBackend("cuda");
+
+                // Create the input Tensor
+                std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+                op->associateInput(0, T0);
+                T0->setDataType(DataType::Float32);
+                T0->setBackend("cuda");
+                // generate a random Tensor shape
+                const std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dims.push_back(dimSizeDist(gen));
+                }
+
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // Fill input tensor with integer-valued samples (truncated), so window sums are exactly representable in float
+                std::vector<float> array0(nb_elements);
+                for (float &value : array0)
+                {
+                    value = static_cast<float>(static_cast<int>(valueDist(gen) * 10));
+                }
+                float *input_d;
+                cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements);
+                cudaMemcpy(input_d, array0.data(), sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+                T0->resize(dims);
+                T0->getImpl()->setRawPtr(input_d, nb_elements);
+
+                // Fill expected output
+                const std::size_t N = dims[0]; // Batch
+                const std::size_t C = dims[1]; // Channels
+                const std::size_t H = dims[2]; // Height
+                const std::size_t W = dims[3]; // Width
+
+                // Compute output dimensions
+                const std::size_t outH = (H - kernel) / stride + 1;
+                const std::size_t outW = (W - kernel) / stride + 1;
+
+                // Host reference result; held in a vector so it is freed at the end of every trial
+                const std::size_t nbelemOut = N * C * outH * outW;
+                std::vector<float> output(nbelemOut);
+
+                for (std::size_t n = 0; n < N; ++n)
+                {
+                    for (std::size_t c = 0; c < C; ++c)
+                    {
+                        for (std::size_t i = 0; i < outH; ++i)
+                        {
+                            for (std::size_t j = 0; j < outW; ++j)
+                            {
+                                float sum = 0.0f;
+                                for (std::size_t m = 0; m < kernel; ++m)
+                                {
+                                    for (std::size_t k = 0; k < kernel; ++k)
+                                    {
+                                        sum += array0[((n * C + c) * H + i * stride + m) * W + j * stride + k];
+                                    }
+                                }
+                                output[((n * C + c) * outH + i) * outW + j] = sum / (kernel * kernel);
+                            }
+                        }
+                    }
+                }
+                // Run inference
+                start = std::chrono::system_clock::now();
+                myAvgPool->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                std::vector<float> computedOutput(nbelemOut);
+                cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nbelemOut, cudaMemcpyDeviceToHost);
+
+                // Compare the device result with the host reference
+                for (std::size_t i = 0; i < nbelemOut; ++i)
+                {
+                    REQUIRE(fabs(computedOutput[i] - output[i]) < 1e-4);
+                }
+
+                // Release the device buffer allocated for this trial
+                cudaFree(input_d);
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+    }
 }
\ No newline at end of file