diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py
index 0aeeb04b74a078f77c57500b959d6ef9fa9af4d0..c37fc54437c02b0bb1c6f09a1c73d5cc538fa4c0 100644
--- a/aidge_backend_cpu/unit_tests/test_scheduler.py
+++ b/aidge_backend_cpu/unit_tests/test_scheduler.py
@@ -17,12 +17,12 @@ class test_scheduler(unittest.TestCase):
 
         input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input")
         relu = aidge_core.ReLU()
+        input_node.add_child(relu)
 
         gv = aidge_core.GraphView()
         gv.add(relu)
         gv.add(input_node)
 
-        input_node.add_child(relu)
 
         gv.set_datatype(aidge_core.dtype.int32)
         gv.set_backend("cpu")
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index b770cf2b51d1489dc30b6173de25506283208392..3a605de8cbfcda125f3982e5574040a4e544d0a6 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -19,6 +19,7 @@
 #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6da67bb7dd4469b6ca609c5aea1ae70dfca3f939
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
+    void(const BitShift_Op::BitShiftDirection,
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const void*, 
+    const void*,
+    void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(BitShift_Op, "cpu", Aidge::BitShiftImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f815e946ea2e4abaff48a6e5155368d564e88e8c
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
@@ -0,0 +1,70 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstdint>     // std::int32_t, std::int64_t
+#include "aidge/operator/BitShift.hpp"
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+
+
+#include <numeric>  // std::accumulate
+namespace Aidge {
+template <class I1, class I2, class O>
+void BitShiftImpl_cpu_forward_kernel(
+                                const BitShift_Op::BitShiftDirection direction,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_
+                                ) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>());
+    
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+        if(direction == BitShift_Op::BitShiftDirection::right)
+
+        {
+                output[oIndex]= input_1[idx1] >> input_2[idx2];
+        }
+        else
+        {
+                output[oIndex] = input_1[idx1] << input_2[idx2];
+        }
+    }
+}
+
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int32},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>,nullptr});
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int64},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>,nullptr});
+
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index ff9bb148fa68d75e2d4b00804e13f063e3ca2cc0..59a471aee82f7c706be390d80b5db569bd3c6f1e 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -38,7 +38,7 @@ namespace Aidge {
  */
 template <class I, class W, class B, class O>
 void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
-                            const std::array<DimSize_t, 1>& /*dilationDims*/,
+                            const std::array<DimSize_t, 1>& dilationDims,
                             const std::array<DimSize_t, 1>& kernelDims,
                             const std::array<DimSize_t, 3>& inputDims,
                             const void *input_,
@@ -53,10 +53,12 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
 
 
     // output H size
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
 
+
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
     // input  (batch, ch, Xin, Yin)
@@ -71,15 +73,17 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
             const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2];
             const std::size_t wIndex = ch * kernelDims[0];
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                const std::size_t sxMin = 0;
+                const std::size_t sxMax = dilated_kernel_x;
                 const std::size_t oIndexFull = oIndex + ox;
                 const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
 
-                for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
+                for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
                     output[oIndexFull] += weights[wIndex + sx] *
-                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
+                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
                 }
             }
         }
@@ -113,7 +117,7 @@ REGISTRAR(ConvDepthWiseImpl1D_cpu,
  */
 template <class I, class W, class B, class O>
 void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                            const std::array<DimSize_t, 2>& /*dilationDims*/,
+                            const std::array<DimSize_t, 2>& dilationDims,
                             const std::array<DimSize_t, 2>& kernelDims,
                             const std::array<DimSize_t, 4>& inputDims,
                             const void *input_,
@@ -129,12 +133,14 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 
 
     // output H size
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
     // output W size
+    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
 
     // TODO: kernel computation
@@ -151,13 +157,17 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
             const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
             const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                const std::size_t sxMin = 0;
+                const std::size_t sxMax = dilated_kernel_x;
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
+                    // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
+                    // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                    // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
+                    const std::size_t syMin = 0;
+                    const std::size_t syMax = dilated_kernel_y;
                     const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                     const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
                     const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
@@ -173,10 +183,10 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                                                 weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
                                                 weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
                     } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
+                        for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
+                            for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
                                 output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                        input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
+                                                        input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
                             }
                         }
                     }
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index cc3bd57cb17f2a0feb6a79af2c291e6f960467d8..e800c252676ec5247a776abf458f808289b278c8 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -40,7 +40,7 @@ namespace Aidge {
  */
 template <class I, class W, class B, class O>
 void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
-                            const std::array<DimSize_t, 1>& /*dilationDims*/,
+                            const std::array<DimSize_t, 1>& dilationDims,
                             const std::array<DimSize_t, 1>& kernelDims,
                             const std::array<DimSize_t, 3>& inputDims,
                             DimSize_t outChannels,
@@ -57,8 +57,9 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
 
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
@@ -76,15 +77,17 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
                 const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
                 const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    const std::size_t sxMin = 0;
+                    const std::size_t sxMax = dilated_kernel_x;
                     const std::size_t oIndexFull = oIndex + ox;
                     const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
 
-                    for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
+                    for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
                         output[oIndexFull] += weights[wIndex + sx] *
-                                                input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
+                                                input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
                     }
                 }
             }
@@ -122,7 +125,7 @@ REGISTRAR(ConvImpl1D_cpu,
  */
 template <class I, class W, class B, class O>
 void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                            const std::array<DimSize_t, 2>& /*dilationDims*/,
+                            const std::array<DimSize_t, 2>& dilationDims,
                             const std::array<DimSize_t, 2>& kernelDims,
                             const std::array<DimSize_t, 4> &inputDims,
                             DimSize_t outChannels,
@@ -139,12 +142,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     // output W size
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilationDims[1]*(kernelDims[1] - 1) - 1 + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
+    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
+
 
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
@@ -162,13 +168,17 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                 const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    const std::size_t sxMin = 0;
+                    const std::size_t sxMax = dilated_kernel_x;
                     for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                        const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                        const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
+                        // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
+                        // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                        // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
+                        const std::size_t syMin = 0;
+                        const std::size_t syMax = dilated_kernel_y;
                         const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                         const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
                         const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
@@ -184,10 +194,10 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                                    weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
                                                    weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
                         } else {
-                            for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                                for (std::size_t sy = syMin; sy < syMax; ++sy) {
+                            for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
+                                for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
                                     output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
+                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
                                 }
                             }
                         }
diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp
index daf23177fb57bee4111c92654ad94dfae3e50f08..cfbb8173d1f83162519016a8f2b3c3166977a5b7 100644
--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -24,7 +24,8 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
     void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
+
 
 // Implementation entry point registration to Operator
 REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
index f484fabf437f656dc8671d4ac78161ef11e84de5..ab9b2ccc7b823842decd044b90a5c6364cedc9c9 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -31,14 +31,10 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
     const I2* input_2 = static_cast<const I2*>(input2_);
     O* output = static_cast<O*>(output_);
 
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
 	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) 
 	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+		std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
 
 		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
 		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
@@ -47,16 +43,53 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
 	}
 }
 
+template <class I1, class I2, class O>
+void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                const void* gradOutput_,
+                                void* gradientInput0_,
+                                void* gradientInput1_) {
+	const I1* input0 = static_cast<const I1*>(input0_);
+	I1* grad0 = static_cast<I1*>(gradientInput0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    I2* grad1 = static_cast<I2*>(gradientInput1_);
+    const O* gradOut = static_cast<const O*>(gradOutput_);
+
+    // Fill input grads with zeros
+	std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+	std::fill(grad0, grad0 + input0Elements, I1(0));
+	std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+	std::fill(grad1, grad1 + input1Elements, I2(0));
+
+	std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    for (size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        // Compute indexes in inputs 0 and 1 to support broadcasting
+        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+
+        // grad0 = grad_output * (input1 * pow(input0, (input1 -1)))
+        grad0[idx0] += gradOut[oIndex]*input1[idx1]* std::pow(input0[idx0], input1[idx1]-1);
+
+        // grad1 = grad_output * (output * ln(input0))
+        grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(PowImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(PowImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(PowImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
diff --git a/project_name.txt b/project_name.txt
deleted file mode 100644
index f8a086fc063978638db5a0fcfe1dc2e5c9d0c1b7..0000000000000000000000000000000000000000
--- a/project_name.txt
+++ /dev/null
@@ -1 +0,0 @@
-aidge_backend_cpu
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 24ce15ab7ead32f98c7ac3edcd34bb2010ff4326..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy
diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e0f79fd29fd140f0b41c64d245b9b240da80028
--- /dev/null
+++ b/src/operator/BitShiftImpl.cpp
@@ -0,0 +1,57 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric>
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp"
+
+template<>
+void Aidge::BitShiftImpl_cpu::forward() {
+
+    const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp);
+
+
+    const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+
+    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
+    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+    BitShift_Op::BitShiftDirection direction = op_.direction();
+
+    // Call kernel
+    impl.forward(
+        direction,
+        inputDims0,
+        inputDims1,
+        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawInput(1)),
+        getCPUPtr(mOp.getRawOutput(0)));
+        
+}
+
+template <>
+void Aidge::BitShiftImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for BitShift_Op on backend cpu");
+}
\ No newline at end of file
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
index fe16bb955973d99e022c61043e8144aeaf6801a1..74a7be71e176ba8e1cb8851050e575d6aa7465df 100644
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -44,21 +44,29 @@ void Aidge::PowImpl_cpu::forward() {
 
 template <>
 void Aidge::PowImpl_cpu::backward() {
-    // Find the correct kernel type
     const Pow_Op& op_ = dynamic_cast<const Pow_Op&>(mOp);
-    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(op_.getInput(0)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
-    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(op_.getInput(1)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad()->dims(),
+                                                                       std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad()->dims());
+    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad()->dims(),
+                                                                       std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->grad()->dims());
 
     // Find the correct kernel type
     const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.backward(op_.getOutput(0)->grad()->dims(),
-               input0gradDims,
-               input1gradDims,
-               getCPUPtr(mOp.getRawOutput(0)),
-               getCPUPtr(mOp.getRawInput(0)),
-               getCPUPtr(mOp.getRawInput(1)));
+    impl.backward(input0gradDims,
+                input1gradDims,
+                out0grad->dims(),
+                getCPUPtr(in0),
+                getCPUPtr(in1),
+                getCPUPtr(out0grad),
+                getCPUPtr(in0grad),
+                getCPUPtr(in1grad));
 }
\ No newline at end of file
diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp
index 31fbed4c090f5e4848df12f2bc2ccd36e3aedf9d..5f870acfb44366632474b7290228658d7a4701dd 100644
--- a/unit_tests/data/Test_TensorImpl.cpp
+++ b/unit_tests/data/Test_TensorImpl.cpp
@@ -154,6 +154,10 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") {
         Tensor T4(T1->dims());
         T4.setDataType(DataType::Float64);
         REQUIRE_THROWS(*T0 + T4);
+
+        delete[] array0;
+        delete[] array1;
+        delete[] result;
     }
 }
 
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a52990bc7991a325ce151cf6634b0d5a831992c8
--- /dev/null
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -0,0 +1,245 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>   // std::size_t
+#include <cstdint>   // std::uint16_t
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>   
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <iomanip>
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 15;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    BitShift_Op::BitShiftDirection direction = BitShift_Op::BitShiftDirection::left;
+
+    if(valueDist(gen) % 2 == 0)
+    {
+        direction = BitShift_Op::BitShiftDirection::right;
+    }
+
+    // Create BitShift Operator
+    std::shared_ptr<Node> myBitShift = BitShift(direction);
+    auto op = std::static_pointer_cast<OperatorTensor>(myBitShift-> getOperator());
+    op->setDataType(DataType::Int32);
+    op->setBackend("cpu");
+
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Int32);
+    T0->setBackend("cpu");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op -> associateInput(1,T1);
+    T1->setDataType(DataType::Int32);
+    T1->setBackend("cpu");
+
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Int32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'BitShift_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("BitShiftImpl_cpu::forward()") {
+        SECTION("Test Forward Kernel with same dimensions") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                int* array0 = new int[nb_elements];
+                int* array1 = new int[nb_elements];
+                int* result = new int[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = std::abs(valueDist(gen)); // shifting by a negative count is undefined behavior in C++, so keep shift amounts non-negative
+                    if(direction == BitShift_Op::BitShiftDirection::left)
+                    {
+                        result[i] = array0[i] << array1[i];
+                    }
+                    else
+                    {
+                        result[i] = array0[i] >> array1[i];
+                    }
+                }
+
+                // input0
+                T0->resize(dims);
+                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // input1
+                T1->resize(dims);
+                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                bool is_eq = approxEq<int>(*(op->getOutput(0)), *Tres);
+
+                auto Output = *(op->getOutput(0));
+                auto prt = Output.getImpl()->rawPtr();
+
+                REQUIRE(is_eq);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+        SECTION("Test BitShift kernels with Broadcasting") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // create arrays and fill them with random values
+                int* array0 = new int[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                int* array1 = new int[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
+                int* result = new int[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+                    array1[i] = std::abs(valueDist(gen));
+                }
+
+                //True result with broadcast
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
+                                                    + ((dims1[3] > 1) ? d : 0);
+                                if(direction == BitShift_Op::BitShiftDirection::left)
+                                {
+                                    result[idx_out + d] = array0[idx0] << array1[idx1];
+                                }
+                                else
+                                {
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];                               
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres));
+                if(equiv == false)
+                {
+                    std::cout << "Problem\n";
+                }
+                REQUIRE(equiv);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+}
+}
+} // namespace Aidge
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index 8a1e589fa0e9a57d712c77a12501d35f5f995bcc..d6e934b4dc8d84e8a595eb74d1af9d2c68c892d1 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -101,6 +101,10 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] bigArray1;
+            delete[] bigArray2;
+            delete[] res;
         }
         std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
         std::cout << "total time: " << duration.count() << std::endl;
@@ -165,6 +169,10 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] bigArray1;
+            delete[] bigArray2;
+            delete[] res;
         }
         std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
         std::cout << "total time: " << duration.count() << std::endl;
@@ -231,6 +239,10 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             end = std::chrono::system_clock::now();
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] bigArray1;
+            delete[] bigArray2;
+            delete[] res;
         }
         std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
         std::cout << "total time: " << duration.count() << std::endl;
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 3b85defb37ff76439b658faa84c3c7457a152d2f..cb5d8872c9c7242bb4aa4efca388d53b578417f9 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -313,5 +313,171 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
     }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {2.0, 3.0},
+                            {4.0, 5.0}
+                        },
+                        {
+                            {6.0, 7.0},
+                            {8.0, 9.0}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {1.0, 2.0},
+                            {3.0, 2.0}
+                        },
+                        {
+                            {2.0, 3.0},
+                            {1.0, 0.5}
+                        }
+                    }
+                }
+            ));
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.5, 1.0},
+                            {1.5, 2.0}
+                        },
+                        {
+                            {2.5, 3.0},
+                            {3.5, 4.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.50000000,   6.00000000},
+                            {72.00000000,  20.00000000}
+                        },
+                        {
+                            {30.00000000, 441.00000000},
+                            {3.50000000,   0.66666669}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {  0.693147182, 9.88751030},
+                            {1.33084259e+02, 8.04718933e+01}
+                        },
+                        {
+                            {1.61258362e+02, 2.00234143e+03},
+                            {5.82243652e+01, 2.63666954e+01}
+                        }
+                    }
+                }
+            ));
+            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            opr->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+        SECTION("Broadcasting") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {1.5, 2.5, 3.5},
+                            {4.5, 5.5, 6.5}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {0.1, 0.2, 0.3}
+                }
+            ));
+
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {6.0, 5.0, 4.0},
+                            {3.0, 2.0, 1.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {0.10000000, 0.22973967, 0.41711676},
+                            {0.11486985, 0.27594593, 0.51353097}
+                        },
+                        {
+                            {0.41655189, 0.48044977, 0.49926791},
+                            {0.07748720, 0.10227509, 0.08092485}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {14.14779854, 22.99299049, 33.56402588}
+                }
+            ));
+
+            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
 }
 } // namespace Aidge
diff --git a/version.txt b/version.txt
index 7179039691ce07a214e7a815893fee97a97b1422..0d91a54c7d439e84e3dd17d3594f1b2b6737f430 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.3
+0.3.0