From 7d8a52b4e02173bb391f31067ffdf257ea75c5aa Mon Sep 17 00:00:00 2001
From: Antoni Olivier <olivier.antoni@cea.fr>
Date: Thu, 10 Apr 2025 14:02:59 +0200
Subject: [PATCH] Set output gradients after forwardDims() to fix gradient tensor dimensions

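The backward unit tests for the Add, Div, Mul, Pow and Sub operators set
the gradient of the output tensor before calling forwardDims() (or
forward(), for the Pow backward tests), so the gradient tensor could end
up with dimensions that do not match the computed output. Set the output
gradient only once the output dimensions are known:

    op->associateInput(0, T0);
    op->associateInput(1, T1);
    op->forwardDims();

    op->getOutput(0)->setGrad(newGrad);
    op->backward();

Also create the gradient tensor directly with the output shape, replacing

    op->getOutput(0)->setGrad(std::make_shared<Tensor>());
    op->getOutput(0)->grad()->resize(outputDims);

with

    op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));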
---
 unit_tests/operator/Test_AddImpl.cpp |  14 +-
 unit_tests/operator/Test_DivImpl.cpp |  13 +-
 unit_tests/operator/Test_MulImpl.cpp |  13 +-
 unit_tests/operator/Test_PowImpl.cpp | 974 ++++++++++++++-------------
 unit_tests/operator/Test_SubImpl.cpp |  12 +-
 5 files changed, 512 insertions(+), 514 deletions(-)

diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index 4538b322..d9adb484 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -159,10 +159,10 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -194,8 +194,9 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
+
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -236,9 +237,9 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -290,8 +291,8 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
+        op->getOutput(0)->setGrad(newGrad);
 
         op->backward();
 
@@ -364,8 +365,7 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 4e7657ed..f7993753 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -339,10 +339,10 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -373,9 +373,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -415,9 +415,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -471,9 +471,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -545,8 +545,7 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 2937e949..a8e0fbdd 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -46,10 +46,10 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -80,9 +80,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -122,9 +122,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -176,9 +176,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -250,8 +250,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 55a416c3..8f3b2c35 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -1,486 +1,488 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#include <chrono>      // std::micro, std::chrono::time_point,
-                       // std::chrono::system_clock, std::chrono::duration
-#include <cstddef>     // std::size_t
-#include <cstdint>     // std::uint16_t
-#include <functional>  // std::multiplies
-#include <memory>
-#include <numeric>     // std::accumulate
-#include <random>      // std::random_device, std::mt19937
-                       // std::uniform_int_distribution, std::uniform_real_distribution
-#include <vector>
-
-#include <catch2/catch_test_macros.hpp>
-#include <fmt/core.h>
-
-#include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/backend/cpu/operator/PowImpl.hpp"
-#include "aidge/data/Data.hpp"
-#include "aidge/data/Tensor.hpp"
-#include "aidge/operator/Pow.hpp"
-#include "aidge/utils/ArrayHelpers.hpp"
-#include "aidge/utils/TensorUtils.hpp"
-
-namespace Aidge {
-
-TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
-    constexpr std::uint16_t NBTRIALS = 10;
-    // Create a random number generator
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-    std::uniform_int_distribution<int> boolDist(0,1);
-
-    // Create MatPow Operator
-    std::shared_ptr<Node> myPow = Pow();
-    auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator());
-    op->setDataType(DataType::Float32);
-    op->setBackend("cpu");
-
-    // Create 2 input Tensors
-    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-    op->associateInput(0,T0);
-    T0->setDataType(DataType::Float32);
-    T0->setBackend("cpu");
-    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
-    op -> associateInput(1,T1);
-    T1->setDataType(DataType::Float32);
-    T1->setBackend("cpu");
-
-    // Create results Tensor
-    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
-    Tres->setDataType(DataType::Float32);
-    Tres->setBackend("cpu");
-
-    // To measure execution time of 'MatPow_Op::forward()' member function call
-    std::chrono::time_point<std::chrono::system_clock> start;
-    std::chrono::time_point<std::chrono::system_clock> end;
-    std::chrono::duration<double, std::micro> duration{};
-
-    SECTION("PowImpl_cpu::forward()") {
-        SECTION("Scalar / Scalar") {
-
-        }
-        SECTION("Scalar / +1-D Tensor") {
-
-        }
-        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
-            std::size_t number_of_operation = 0;
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                const std::size_t nbDims = nbDimsDist(gen);
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
-                }
-                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-
-                // without broadcasting
-                float* array0 = new float[nb_elements];
-                float* array1 = new float[nb_elements];
-                float* result = new float[nb_elements];
-
-                for (std::size_t i = 0; i < nb_elements; ++i) {
-                    array0[i] = valueDist(gen);
-                    array1[i] = valueDist(gen);
-                    result[i] = std::pow(array0[i], array1[i]);
-                }
-
-                // input0
-                T0->resize(dims);
-                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
-
-                // input1
-                T1->resize(dims);
-                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
-
-                // results
-                Tres->resize(dims);
-                Tres -> getImpl() -> setRawPtr(result, nb_elements);
-
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                // with broadcasting
-            }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-
-        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
-            std::size_t number_of_operation = 0;
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                // handle dimensions, replace some dimensions with '1' to get broadcasting
-                constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
-                }
-                std::vector<std::size_t> dims0 = dims;
-                std::vector<std::size_t> dims1 = dims;
-                std::vector<std::size_t> dimsOut = dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
-                        dims0[i] = 1;
-                    }
-                    if (boolDist(gen)) {
-                        dims1[i] = 1;
-                    }
-                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
-                }
-
-                // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
-                    array0[i] = valueDist(gen);
-                }
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
-                    array1[i] = valueDist(gen);
-                }
-
-                // compute true result
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
-                                                    + ((dims1[3] > 1) ? d : 0);
-                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
-                            }
-                        }
-                    }
-                }
-
-                // conversion to Aidge::Tensors
-                // input0
-                T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
-
-                // input1
-                T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
-
-                // results
-                Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
-
-                // compute result
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                // comparison between truth and computed result
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-            }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-        SECTION("+1-D Tensor / 1-D Tensor") {
-            std::size_t number_of_operation = 0;
-            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                // handle dimensions
-                constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims0(4);
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims0[i] = dimSizeDist(gen);
-                }
-                std::vector<std::size_t> dimsOut = dims0;
-                std::vector<std::size_t> dims1 = dims0;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
-                        dims1[i] = 1;
-                    }
-                }
-                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
-
-                // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                float* array1 = new float[array1_size];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
-                    array0[i] = valueDist(gen);
-                }
-                for (std::size_t i = 0; i < array1_size; ++i) {
-                    array1[i] = valueDist(gen);
-                }
-
-                // compute true result
-                auto dims1_tmp = dims1;
-                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
-
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
-                                                    + ((dims1_tmp[3] > 1) ? d : 0);
-                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
-                            }
-                        }
-                    }
-                }
-
-                // conversion to Aidge::Tensors
-                // input0
-                T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
-
-                // input1
-                T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, array1_size);
-
-                // results
-                Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
-
-                // compute result
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                // comparison between truth and computed result
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-            }
-
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-    }
-
-
-    SECTION("PowImpl_cpu::backward()") {
-        SECTION("3D Tensors") {
-            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {2.0, 3.0},
-                            {4.0, 5.0}
-                        },
-                        {
-                            {6.0, 7.0},
-                            {8.0, 9.0}
-                        }
-                    }
-                }
-            ));
-            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {1.0, 2.0},
-                            {3.0, 2.0}
-                        },
-                        {
-                            {2.0, 3.0},
-                            {1.0, 0.5}
-                        }
-                    }
-                }
-            ));
-            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {0.5, 1.0},
-                            {1.5, 2.0}
-                        },
-                        {
-                            {2.5, 3.0},
-                            {3.5, 4.0}
-                        }
-                    }
-                }
-            ));
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {0.50000000,   6.00000000},
-                            {72.00000000,  20.00000000}
-                        },
-                        {
-                            {30.00000000, 441.00000000},
-                            {3.50000000,   0.66666669}
-                        }
-                    }
-                }
-            ));
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {  0.693147182, 9.88751030},
-                            {1.33084259e+02, 8.04718933e+01}
-                        },
-                        {
-                            {1.61258362e+02, 2.00234143e+03},
-                            {5.82243652e+01, 2.63666954e+01}
-                        }
-                    }
-                }
-            ));
-            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
-            std::shared_ptr<Node> powOp = Pow();
-            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
-            opr->setDataType(DataType::Float32);
-            opr->setBackend("cpu");
-            opr->associateInput(0, input0);
-            opr->associateInput(1, input1);
-            opr->getOutput(0)->setGrad(gradOut);
-            opr->forward();
-
-            powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
-        }
-        SECTION("Broadcasting") {
-            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {1.5, 2.5, 3.5},
-                            {4.5, 5.5, 6.5}
-                        }
-                    }
-                }
-            ));
-            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
-                {
-                    {0.1, 0.2, 0.3}
-                }
-            ));
-
-            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {6.0, 5.0, 4.0},
-                            {3.0, 2.0, 1.0}
-                        }
-                    }
-                }
-            ));
-            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {0.10000000, 0.22973967, 0.41711676},
-                            {0.11486985, 0.27594593, 0.51353097}
-                        },
-                        {
-                            {0.41655189, 0.48044977, 0.49926791},
-                            {0.07748720, 0.10227509, 0.08092485}
-                        }
-                    }
-                }
-            );
-            const Tensor expectedGrad1 = Array1D<float, 3>(
-                {
-                    {14.14779854, 22.99299049, 33.56402588}
-                }
-            );
-
-            std::shared_ptr<Node> powOp = Pow();
-            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
-            opr->setDataType(DataType::Float32);
-            opr->setBackend("cpu");
-            opr->associateInput(0, input0);
-            opr->associateInput(1, input1);
-            opr->getOutput(0)->setGrad(gradOut);
-            powOp->forward();
-
-            powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
-        }
-    }
-}
-} // namespace Aidge
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
+#include <memory>
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Pow.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    // Create Pow Operator
+    std::shared_ptr<Node> myPow = Pow();
+    auto op = std::static_pointer_cast<OperatorTensor>(myPow->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0, T0);
+    T0->setDataType(DataType::Float32);
+    T0->setBackend("cpu");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op->associateInput(1, T1);
+    T1->setDataType(DataType::Float32);
+    T1->setBackend("cpu");
+
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Float32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'Pow_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("PowImpl_cpu::forward()") {
+        SECTION("Scalar / Scalar") {
+
+        }
+        SECTION("Scalar / +1-D Tensor") {
+
+        }
+        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                float* result = new float[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = valueDist(gen);
+                    result[i] = std::pow(array0[i], array1[i]);
+                }
+
+                // input0
+                T0->resize(dims);
+                T0->getImpl()->setRawPtr(array0, nb_elements);
+
+                // input1
+                T1->resize(dims);
+                T1->getImpl()->setRawPtr(array1, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres->getImpl()->setRawPtr(result, nb_elements);
+
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                // with broadcasting
+            }
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+
+        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // create arrays and fill them with random values
+                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
+                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+
+                // compute true result
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
+                                                    + ((dims1[3] > 1) ? d : 0);
+                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
+                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0->getImpl()->setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1->getImpl()->setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres->getImpl()->setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+        SECTION("+1-D Tensor / 1-D Tensor") {
+            std::size_t number_of_operation = 0;
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims0(4);
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims0[i] = dimSizeDist(gen);
+                }
+                std::vector<std::size_t> dimsOut = dims0;
+                std::vector<std::size_t> dims1 = dims0;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                }
+                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+
+                // create arrays and fill them with random values
+                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array1 = new float[array1_size];
+                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < array1_size; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+
+                // compute true result
+                auto dims1_tmp = dims1;
+                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
+
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
+                                                    + ((dims1_tmp[3] > 1) ? d : 0);
+                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
+                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0->getImpl()->setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1->getImpl()->setRawPtr(array1, array1_size);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres->getImpl()->setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+    }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {2.0, 3.0},
+                            {4.0, 5.0}
+                        },
+                        {
+                            {6.0, 7.0},
+                            {8.0, 9.0}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {1.0, 2.0},
+                            {3.0, 2.0}
+                        },
+                        {
+                            {2.0, 3.0},
+                            {1.0, 0.5}
+                        }
+                    }
+                }
+            ));
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.5, 1.0},
+                            {1.5, 2.0}
+                        },
+                        {
+                            {2.5, 3.0},
+                            {3.5, 4.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.50000000,   6.00000000},
+                            {72.00000000,  20.00000000}
+                        },
+                        {
+                            {30.00000000, 441.00000000},
+                            {3.50000000,   0.66666669}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {  0.693147182, 9.88751030},
+                            {1.33084259e+02, 8.04718933e+01}
+                        },
+                        {
+                            {1.61258362e+02, 2.00234143e+03},
+                            {5.82243652e+01, 2.63666954e+01}
+                        }
+                    }
+                }
+            ));
+            for (const auto& T : {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp->getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->forward();
+
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->backward();
+
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+        SECTION("Broadcasting") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {1.5, 2.5, 3.5},
+                            {4.5, 5.5, 6.5}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {0.1, 0.2, 0.3}
+                }
+            ));
+
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {6.0, 5.0, 4.0},
+                            {3.0, 2.0, 1.0}
+                        }
+                    }
+                }
+            ));
+            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {0.10000000, 0.22973967, 0.41711676},
+                            {0.11486985, 0.27594593, 0.51353097}
+                        },
+                        {
+                            {0.41655189, 0.48044977, 0.49926791},
+                            {0.07748720, 0.10227509, 0.08092485}
+                        }
+                    }
+                }
+            );
+            const Tensor expectedGrad1 = Array1D<float, 3>(
+                {
+                    {14.14779854, 22.99299049, 33.56402588}
+                }
+            );
+
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp->getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            powOp->forward();
+
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->backward();
+
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
+        }
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index d9b6207b..f87f34d5 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -344,10 +344,10 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         mySub->backward();
 
         // For subtraction: grad_input0 = grad_output
@@ -387,9 +387,9 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         mySub->backward();
 
         REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
@@ -434,14 +434,12 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
+        op->forwardDims();
 
         // Set gradient of output
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
 
-        op->forwardDims();
-
         // Compute reference gradients
         std::vector<float> expectedGrad0(input0Size, 0.0f);
         std::vector<float> expectedGrad1(input1Size, 0.0f);
-- 
GitLab