diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp
index daf23177fb57bee4111c92654ad94dfae3e50f08..cfbb8173d1f83162519016a8f2b3c3166977a5b7 100644
--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -24,7 +24,8 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
     void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
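+// Backward kernel arguments: input0 dims, input1 dims, output dims, input0, input1, gradOutput, gradInput0, gradInput1.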
 
 // Implementation entry point registration to Operator
 REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
index f484fabf437f656dc8671d4ac78161ef11e84de5..ab9b2ccc7b823842decd044b90a5c6364cedc9c9 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -31,14 +31,10 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
     const I2* input_2 = static_cast<const I2*>(input2_);
     O* output = static_cast<O*>(output_);
 
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
 	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) 
 	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+		std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
 
 		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
 		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
@@ -47,16 +43,59 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
 	}
 }
 
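+// Broadcast-aware backward kernel for Pow:
+// for out = pow(in0, in1), d(out)/d(in0) = in1 * pow(in0, in1 - 1)
+// and d(out)/d(in1) = pow(in0, in1) * ln(in0).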
+template <class I1, class I2, class O>
+void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
+                                 const std::vector<std::size_t>& input1Dims,
+                                 const std::vector<std::size_t>& outputDims,
+                                 const void* input0_,
+                                 const void* input1_,
+                                 const void* gradOutput_,
+                                 void* gradientInput0_,
+                                 void* gradientInput1_) {
+    const I1* input0 = static_cast<const I1*>(input0_);
+    I1* grad0 = static_cast<I1*>(gradientInput0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    I2* grad1 = static_cast<I2*>(gradientInput1_);
+    const O* gradOut = static_cast<const O*>(gradOutput_);
+
+    // Gradients are accumulated below, so initialize both input gradients to zero.
+    std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    std::fill(grad0, grad0 + input0Elements, I1(0));
+    std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    std::fill(grad1, grad1 + input1Elements, I2(0));
+
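+    // Iterate over every output element, accumulating its contribution into the (possibly broadcast) input gradients.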
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        // Compute indexes in inputs 0 and 1 to support broadcasting
+        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+
+        // grad0 += grad_output * input1 * pow(input0, input1 - 1)
+        grad0[idx0] += gradOut[oIndex] * input1[idx1] * std::pow(input0[idx0], input1[idx1] - 1);
+
+        // grad1 += grad_output * output * ln(input0)
+        grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
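+        // Note: std::log is only defined for input0 > 0; a non-positive base yields NaN or -inf.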
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(PowImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(PowImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(PowImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
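+// Note: for Int32, std::pow and std::log compute in double precision and the gradients are truncated back to int32_t.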
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
index fe16bb955973d99e022c61043e8144aeaf6801a1..74a7be71e176ba8e1cb8851050e575d6aa7465df 100644
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -44,21 +44,29 @@ void Aidge::PowImpl_cpu::forward() {
 
 template <>
 void Aidge::PowImpl_cpu::backward() {
-    // Find the correct kernel type
     const Pow_Op& op_ = dynamic_cast<const Pow_Op&>(mOp);
-    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(op_.getInput(0)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
-    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(op_.getInput(1)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = in0->grad();
+    auto in1grad = in1->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
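+    // Broadcast each input-gradient shape against the output-gradient shape so the
+    // kernel can map flattened output indices back onto the (possibly broadcast) inputs.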
+    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(out0grad->dims(), in0grad->dims());
+    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(out0grad->dims(), in1grad->dims());
 
     // Find the correct kernel type
     const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.backward(op_.getOutput(0)->grad()->dims(),
-               input0gradDims,
-               input1gradDims,
-               getCPUPtr(mOp.getRawOutput(0)),
-               getCPUPtr(mOp.getRawInput(0)),
-               getCPUPtr(mOp.getRawInput(1)));
+    impl.backward(input0gradDims,
+                  input1gradDims,
+                  out0grad->dims(),
+                  getCPUPtr(in0),
+                  getCPUPtr(in1),
+                  getCPUPtr(out0grad),
+                  getCPUPtr(in0grad),
+                  getCPUPtr(in1grad));
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 3b85defb37ff76439b658faa84c3c7457a152d2f..cb5d8872c9c7242bb4aa4efca388d53b578417f9 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -313,5 +313,171 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
     }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {2.0, 3.0},
+                            {4.0, 5.0}
+                        },
+                        {
+                            {6.0, 7.0},
+                            {8.0, 9.0}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {1.0, 2.0},
+                            {3.0, 2.0}
+                        },
+                        {
+                            {2.0, 3.0},
+                            {1.0, 0.5}
+                        }
+                    }
+                }
+            ));
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.5, 1.0},
+                            {1.5, 2.0}
+                        },
+                        {
+                            {2.5, 3.0},
+                            {3.5, 4.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.50000000,   6.00000000},
+                            {72.00000000,  20.00000000}
+                        },
+                        {
+                            {30.00000000, 441.00000000},
+                            {3.50000000,   0.66666669}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {6.93147182e-01, 9.88751030e+00},
+                            {1.33084259e+02, 8.04718933e+01}
+                        },
+                        {
+                            {1.61258362e+02, 2.00234143e+03},
+                            {5.82243652e+01, 2.63666954e+01}
+                        }
+                    }
+                }
+            ));
+            for (const auto& T : {input0, input1, gradOut, expectedGrad0, expectedGrad1}) {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp->getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            opr->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+        SECTION("Broadcasting") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {1.5, 2.5, 3.5},
+                            {4.5, 5.5, 6.5}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {0.1, 0.2, 0.3}
+                }
+            ));
+
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {6.0, 5.0, 4.0},
+                            {3.0, 2.0, 1.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {0.10000000, 0.22973967, 0.41711676},
+                            {0.11486985, 0.27594593, 0.51353097}
+                        },
+                        {
+                            {0.41655189, 0.48044977, 0.49926791},
+                            {0.07748720, 0.10227509, 0.08092485}
+                        }
+                    }
+                }
+            ));
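+            // input1 is broadcast along the first two axes, so its gradient sums the contributions from every broadcast position.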
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {14.14779854, 22.99299049, 33.56402588}
+                }
+            ));
+
+            for (const auto& T : {input0, input1, gradOut, expectedGrad0, expectedGrad1}) {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp->getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
 }
 } // namespace Aidge