diff --git a/.gitignore b/.gitignore
index 0e14676b900cb1418593019be70cc4d20aba2883..9877699f938bdbf94d0383b5b9db6f5f4cf1023e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 # C++ Build
 build*/
 install*/
+include/aidge/backend/cpu_version.h
 
 # VSCode
 .vscode
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e191c36d5ad57a9a9dbed378154db6676ec796..a2f50c50871d0a47b884b54b353063c60260c1d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,15 +3,18 @@ set(CXX_STANDARD 14)
 
 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
 
+# Parse version.txt to retrieve Major, Minor and Patch
+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version})
+set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
+set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
+set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
+
 project(aidge_backend_cpu
         VERSION ${version}
         DESCRIPTION "CPU implementations of the operators of aidge framework."
         LANGUAGES CXX)
 
-message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
-message(STATUS "Project version: ${version}")
-add_definitions(-DPROJECT_VERSION="${version}")
-
+# Retrieve latest git commit
 execute_process(
     COMMAND git rev-parse --short HEAD
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
@@ -19,8 +22,10 @@ execute_process(
     OUTPUT_STRIP_TRAILING_WHITESPACE
     ERROR_QUIET
 )
+
+message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
+message(STATUS "Project version: ${version}")
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
-add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
 # helper for LSP users
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -64,6 +69,8 @@ file(GLOB_RECURSE inc_files "include/*.hpp")
 add_library(${module_name} ${src_files} ${inc_files})
 
 target_link_libraries(${module_name}
+    PRIVATE
+        fmt::fmt
     PUBLIC
         _aidge_core # _ is added because we link the exported target and not the project
 )
@@ -115,6 +122,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
     append_coverage_compiler_flags()
 endif()
 
+message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h")
+# Generate version.h file from config file version.h.in
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h"
+)
+
 ##############################################
 # Installation instructions
 include(GNUInstallDirs)
diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py
index a7fe1ea3abdea25b18af6e7e0a1958f01f928433..bb320b2fe436a3be81dde8d643728bd5a30942e7 100644
--- a/aidge_backend_cpu/__init__.py
+++ b/aidge_backend_cpu/__init__.py
@@ -1,3 +1,2 @@
 import aidge_core
 from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
-from ._version import *
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index caa75328e58f6c9581f81368a3981bb79a069d49..7553f96b9c17a930c61658e58282b8ee6ba3e015 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -12,6 +12,8 @@
 #ifndef AIDGE_CPU_IMPORTS_H_
 #define AIDGE_CPU_IMPORTS_H_
 
+#include "aidge/backend/cpu_version.h"
+
 #include "aidge/backend/cpu/operator/AbsImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AndImpl.hpp"
@@ -27,6 +29,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/FoldImpl.hpp"
@@ -51,6 +54,7 @@
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 73b710e021ac5031923eb1e9a2492502c02a3633..d7c8ebcf19f64cb60aa2b62f312f4b46351e6ec2 100644
--- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -20,7 +20,7 @@ namespace Aidge {
 namespace {
 // suppose values are contiguous in memory
 template <class I, class O>
-void equal_contiguous_arrays(const std::size_t input1size,
+void and_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
@@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size,
     {
         const std::size_t in1_id = (input1size != 1) ? i : 0;
         const std::size_t in2_id = (input2size != 1) ? i : 0;
-        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+        output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
     }
 }
 }
 
 
 template <class I, class O>
-void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
                                 const void* input0_,
@@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
-        {
-            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
+            output[i] = static_cast<O>(input_0[i] && input_1[i]);
         }
         return;
     }
@@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
@@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 
 }  // namespace Aidge
 
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
index adea96ca43a1ad9d2a49777426913ca4676e4f32..7c76657f7d100255f49ab1e675672407c5fbbaf8 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
@@ -28,8 +28,10 @@ namespace Aidge {
 using AvgPooling2D_Op = AvgPooling_Op<2>;
 using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 4>&,
+        bool,
         const void *,
         void *)>;
 
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index f6da9dcb026101b93de862499d42ae8734532d52..68dbfbe7302c392d87d9c895612c7ace9292ef57 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -35,66 +35,54 @@ namespace Aidge {
 template <class I, class O>
 void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const std::array<DimSize_t, 4> &dims,
+                                        bool ceilMode,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
+    // Calculate output dimensions based on ceilMode and dilations
+    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
+        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
+        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
+        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
+    };
 
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
+    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
+            const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
+            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
+
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
+                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
+                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const std::size_t ix = ox * strideDims[0];
-                    const std::size_t iy = oy * strideDims[1];
+                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
+                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
 
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] += static_cast<O>(
-                                               input[iIndex + (ix+0)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
-                    } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
-                            }
+                    const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    O sum = static_cast<O>(0);
+                    std::size_t count = 0;
+
+                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
+                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
+                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
+                            ++count;
                         }
-                        // padding not used
-                        output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
                     }
+
+                    output[oIndexFull] = sum / static_cast<O>(count);
                 }
             }
         }
diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e2489096067a49139f6291898056d525a77db522
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op,
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3c8ff0f4742e0393efd8cbbf637822c443edffb3
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+namespace {
+// suppose values are contiguous in memory
+template <class I, class O>
+void equal_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+    }
+}
+}
+
+
+template <class I, class O>
+void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 68cc3621514de97d9837e10bcf90218abe559aaa..062088a13c0fcca8116a08f71457e6b0b01d1a65 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -28,6 +28,7 @@ namespace Aidge {
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const bool,
                             const std::array<DimSize_t, 4> &,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 7b6f04f141eb701849a8d436561bcf9e37471cfa..250b11b0e0da100fc415c661ec3497d30203f293 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -35,28 +35,23 @@ namespace Aidge {
 template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const bool /*ceilMode*/,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution parameters as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation parameter into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -77,12 +72,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
                     I poolValue(0.0);
                     bool valid = false;
 
-                    for (unsigned int channel = 0; channel < dims[1];
-                            ++channel){
-                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                            {
-                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor to kernel indices
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure indices are within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
 
                                 if (!valid || value > poolValue) {
                                     poolValue = value;
@@ -98,106 +96,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     }
 }
 
-//N2D2 version
-/*
-template <class T>
-void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
-                                              const Tensor<T>&
-                                              inputs,
-                                              const Descriptor& desc,
-                                              const T* beta,
-                                              Tensor<T>& outputs,
-                                              Tensor<ArgMax>& argMax,
-                                              bool useArgMax,
-                                              const Tensor<bool>& maps)
-{
-    const unsigned int size = inputs.dimB() * outputs.dimZ();
-
-#if defined(_OPENMP) && _OPENMP >= 200805
-#pragma omp parallel for collapse(2) if (size > 16)
-#else
-#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
-#endif
-    for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
-        for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
-            for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
-                for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
-                    const unsigned int sxMin = (unsigned int)std::max(
-                        desc.padding[0] - (int)(ox * desc.stride[0]), 0);
-                    const unsigned int syMin = (unsigned int)std::max(
-                        desc.padding[1] - (int)(oy * desc.stride[1]), 0);
-                    const unsigned int sxMax = Utils::clamp
-                        <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
-                              0,
-                              desc.pool[0]);
-                    const unsigned int syMax = Utils::clamp
-                        <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
-                              0,
-                              desc.pool[1]);
-
-                    const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
-                    const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
-
-                    T poolValue(0.0);
-
-                    // For each output, compute the pool value
-                    if (useArgMax) {
-                        const ArgMax inputMax
-                            = argMax(ox, oy, output, batchPos);
-
-                        if (inputMax.valid) {
-                            poolValue = inputs(inputMax.ix,
-                                               inputMax.iy,
-                                               inputMax.channel,
-                                               batchPos);
-                        }
-                    }
-                    else {
-                        unsigned int ixMax = 0;
-                        unsigned int iyMax = 0;
-                        unsigned int channelMax = 0;
-                        bool valid = false;
-
-                        for (unsigned int channel = 0; channel < inputs.dimZ();
-                             ++channel)
-                        {
-                            if (!maps.empty() && !maps(output, channel))
-                                continue;
-
-                            for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                                for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                                {
-                                    const T value = inputs(ix + sx,
-                                                                 iy + sy,
-                                                                 channel,
-                                                                 batchPos);
-
-                                    if (!valid || value > poolValue) {
-                                        poolValue = value;
-                                        valid = true;
-
-                                        ixMax = ix + sx;
-                                        iyMax = iy + sy;
-                                        channelMax = channel;
-                                    }
-                                }
-                            }
-                        }
-
-                        argMax(ox, oy, output, batchPos)
-                            = ArgMax(ixMax, iyMax, channelMax, valid);
-                    }
-
-                    outputs(ox, oy, output, batchPos)
-                        = (*alpha) * poolValue
-                          + (*beta) * outputs(ox, oy, output, batchPos);
-                }
-            }
-        }
-    }
-}
-
-*/
 
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index c927af9ebd4d658c764cc059df9778c273ba178e..eec5583bb548a3d3343b966c54cfccd1600b8f76 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -34,6 +34,7 @@ using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
         const std::size_t,
         const std::vector<std::size_t>,
         const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
         const void*,
         const void*,
         const void*,
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index 556dd56cd32f28de14a43d20b97deb0083341fee..36acb9199c51e900287ca9b262322aa86287d838 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -149,61 +149,53 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 template <class I1, class I2, class O>
 void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
-                                 const std::size_t input1Length,
-                                 const std::size_t grad0Length,
-                                 const std::vector<std::size_t> input0Dims,
-                                 const std::vector<std::size_t> input1Dims,
-                                 const void* input0_,
-                                 const void* input1_,
-                                 const void* grad_output_,
-                                 void* gradientInput0,
-                                 void* gradientInput1)
+                                  const std::size_t input1Length,
+                                  const std::size_t gradOutputLength,
+                                  const std::vector<std::size_t>& dims0,
+                                  const std::vector<std::size_t>& dims1,
+                                  const std::vector<std::size_t>& outputDims,
+                                  const void* input0_,
+                                  const void* input1_,
+                                  const void* grad_output_,
+                                  void* gradientInput0_,
+                                  void* gradientInput1_)
 {
-    const auto* input0 = static_cast<const I1*>(input0_);
-    const auto* input1 = static_cast<const I1*>(input1_);
-    const auto* grad_output = static_cast<const O*>(grad_output_);
-    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
-    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
-
-
-    if(input0Dims.size() >= input1Dims.size())
-    {
-        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
-
-        for(auto i = 0U; i < input0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
-        }
-
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+    const I1* input0 = static_cast<const I1*>(input0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input0 indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            // If input0 is broadcasted along this dimension (== 1) or both dimensions are 1, index is 0.
+            // idxInput0 represent the multi dim index of input0 contributing
+            // to the output at index i.
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-    } else {
-        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
-
-        for(auto i = 0U; i < input1Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
-
-            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+        // We have to access tensors with a flat index, hence the conversion
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
 
-            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
-        }
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
     }
 }
 
@@ -211,6 +203,9 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
 REGISTRAR(MulImpl_cpu,
     {DataType::Float32},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(MulImpl_cpu,
+    {{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
index 6a22ff4ec9d7beaf05be3b479b43dd3ad69bc74b..6449417baf855620669aba11ebca16d9384c4e7c 100644
--- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -99,30 +99,31 @@ void ResizeImpl_cpu_forward_kernel(
     }
     return;
 }
+
 // Kernels registration to implementation entry point
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int16_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int32_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::Int64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::UInt64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int64_t>,
@@ -130,27 +131,27 @@ REGISTRAR(ResizeImpl_cpu,
 
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<half_float::half>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<float>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<double>,
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff5c4778f530912e8bdf97ffadb2f546789e2c48
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using WeightInterleavedImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavedImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18557f8fb5fcdd31476904d273d4d2d7f37a66b5
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
@@ -0,0 +1,143 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::int8_t, std::uint8_t
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+#include "aidge/data/DataType.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+
+namespace Aidge {
+
+    /**
+     * @brief Compacts 8-bit data into a smaller bit-width representation.
+     *
+     * This function takes an array of 8-bit data and compacts it into smaller chunks
+     * based on the specified bit-width `nb_bits`. Each element in `compactData` will
+     * store multiple packed `nb_bits` segments extracted from `data`.
+     *
+     * @param data The input array of 8-bit values to be compacted.
+     * @param dataSize The size of the input `data` array.
+     * @param compactData The output array storing the compacted data.
+     * @param nb_bits The number of bits to extract from each `data` element (must be between 1 and 4).
+     */
+    template <typename T>
+    void compact_data(const T* data, std::size_t dataSize, T* compactData, std::uint8_t nb_bits) {
+        AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width
+
+        // Mask to extract `nb_bits` from each data element
+        const unsigned int mask = (1U << nb_bits) - 1;
+
+        // Calculate the number of `nb_bits` segments that fit into an 8-bit compacted value
+        const unsigned int nbSlot = 8 / nb_bits;
+
+        // Case nb_bits=3 or 4, then shift is 4
+        // Case nb_bits=2, then shift is 2
+        // Case nb_bits=1, then shift is 1
+        std::uint8_t shift = 8 / nbSlot;
+
+        const unsigned int nbFullCompactbytes = dataSize / nbSlot;
+
+        // Main loop to process data in groups of `nbSlot`
+        for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
+            T compact = 0;
+
+            for (unsigned int j = 0; j < nbSlot; ++j) {
+                compact |= (data[i * nbSlot + j] & mask);    // Apply mask to keep `nb_bits` only
+
+                // Shift only if not on the last slot to make room for the next `nb_bits`
+                if (j < nbSlot - 1) {
+                    compact <<= shift;
+                }
+            }
+            // Store the compacted value in the output array
+            compactData[i] = compact;
+        }
+
+
+        // Handle any remaining data elements (if dataSize is not a multiple of nbSlot).
+        std::size_t remaining = dataSize % nbSlot;
+        if (remaining != 0) {
+            T compact = 0;
+            for (std::size_t j = 0; j < remaining; ++j) {
+                compact |= (data[nbFullCompactbytes*nbSlot + j] & mask);
+
+                if (j < remaining - 1) {
+                    compact <<= shift;
+                }
+            }
+            compact <<= (shift*(nbSlot - remaining));
+            // Store the last compacted value
+            compactData[dataSize / nbSlot] = compact;
+        }
+    }
+
+template <class I, class O, int nb_bits>
+void WeightInterleavedImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
+                            const DimSize_t nb_interleaving,
+                            const DimSize_t output_interleaving,
+                            const void* input_,
+                            void* output_) {
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // Aidge::compact_data(const std::int8_t* data, std::size_t dataSize, std::int8_t* compactData, std::uint8_t nb_bits) {
+    for (std::size_t i=0; i<nb_interleaving; ++i){
+        compact_data(input+(i*input_interleaving), input_interleaving, output+(i*output_interleaving), static_cast<std::uint8_t>(nb_bits));
+    }
+
+}
+
+
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Binary>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
+
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
+
+
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+
+
+}
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/version.h.in b/include/aidge/backend/version.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..4b876f63002972c1f8f1340b70cdecdace911012
--- /dev/null
+++ b/include/aidge/backend/version.h.in
@@ -0,0 +1,11 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+namespace Aidge {
+static constexpr const int PROJECT_VERSION_MAJOR = @PROJECT_VERSION_MAJOR@;
+static constexpr const int PROJECT_VERSION_MINOR = @PROJECT_VERSION_MINOR@;
+static constexpr const int PROJECT_VERSION_PATCH = @PROJECT_VERSION_PATCH@;
+static constexpr const char * PROJECT_VERSION = "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@";
+static constexpr const char * PROJECT_GIT_HASH = "@GIT_COMMIT_HASH@";
+}
+#endif // VERSION_H
diff --git a/include/aidge/utils/sys_info/CpuVersionInfo.hpp b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
index 887ce839e079349d9d64505f7184831ffc4cf1c2..3df70d139bd4ad26e0c88d2ee5192e508bc3f71a 100644
--- a/include/aidge/utils/sys_info/CpuVersionInfo.hpp
+++ b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
@@ -2,17 +2,20 @@
 #define AIDGE_UTILS_SYS_INFO_CPU_VERSION_INFO_H
 
 #include "aidge/utils/Log.hpp"
+#include "aidge/backend/cpu_version.h"
 
 namespace Aidge {
 
-#ifndef PROJECT_VERSION // Normally defined in CMakeLists.txt
-#define PROJECT_VERSION "Unknown version"
-#endif
-#ifndef GIT_COMMIT_HASH
-#define GIT_COMMIT_HASH ""
-#endif
-void showCpuVersion() {
-    Log::info("Aidge backend CPU: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__);
+constexpr inline const char * getBackendCPUProjectVersion(){
+    return PROJECT_VERSION;
+}
+
+constexpr inline const char * getBackendCPUGitHash(){
+    return PROJECT_GIT_HASH;
+}
+
+void showBackendCpuVersion() {
+    Log::info("Aidge backend CPU: {} ({}), {} {}", getBackendCPUProjectVersion(), getBackendCPUGitHash(), __DATE__, __TIME__);
         // Compiler version
     #if defined(__clang__)
     /* Clang/LLVM. ---------------------------------------------- */
diff --git a/project_name.txt b/project_name.txt
new file mode 100644
index 0000000000000000000000000000000000000000..25caafdd8ab794dbcb8c8cdef5097e2143accc6a
--- /dev/null
+++ b/project_name.txt
@@ -0,0 +1 @@
+aidge_backend_cpu
diff --git a/pyproject.toml b/pyproject.toml
index 3c08302d0fcd4e77943d165fab802a22f4dc39cc..baa61de57375aa97f803efb19bf59973d39ef36f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,17 +7,25 @@ dependencies = [
 requires-python = ">= 3.7"
 readme = "README.md"
 license = { file = "LICENSE" }
-classifiers = [ 
+classifiers = [
     "Development Status :: 2 - Pre-Alpha",
     "Programming Language :: Python :: 3"
     ]
-dynamic = ["version"] # defined in tool.setuptools_scm
+dynamic = ["version"] # defined by pbr
+
+
+[project.urls]
+Homepage = "https://www.deepgreen.ai/en/platform"
+Documentation = "https://eclipse-aidge.readthedocs.io/en/latest/"
+Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu"
+Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/issues"
+Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/releases"
 
 [build-system]
 requires = [
     "setuptools>=64",
-    "setuptools_scm[toml]==7.1.0",
-    "cmake>=3.18.4.post1"
+    "cmake>=3.18.4.post1",
+    "pbr"
 ]
 build-backend = "setuptools.build_meta"
 
@@ -29,9 +37,6 @@ where = ["."]  # list of folders that contain the packages (["."] by default)
 include = ["aidge_backend_cpu*"]  # package names should match these glob patterns (["*"] by default)
 exclude = ["aidge_backend_cpu.unit_tests*"]  # exclude packages matching these glob patterns (empty by default)
 namespaces = false  # to disable scanning PEP 420 namespaces (true by default)
-# SETUPTOOLS_SCM
-[tool.setuptools_scm]
-write_to = "aidge_backend_cpu/_version.py"
 
 #####################################################
 # CIBUILDWHEEL
diff --git a/python_binding/pybind_cpu.cpp b/python_binding/pybind_cpu.cpp
index d5022e1d469ae4171e796baed6c1aa061dd95765..e576de0849ec6e0739b7cad32e8e8b003c092b7b 100644
--- a/python_binding/pybind_cpu.cpp
+++ b/python_binding/pybind_cpu.cpp
@@ -6,10 +6,10 @@ namespace py = pybind11;
 
 namespace Aidge {
 
-void init_cpu_sys_info(py::module& m);
+void init_CpuVersionInfo(py::module& m);
 
 void init_Aidge(py::module& m){
-    init_cpu_sys_info(m);
+    init_CpuVersionInfo(m);
 }
 
 
diff --git a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
index 573bee3659c65f90935e03c06eff5a2998bb9f5b..7461dd955e8b3e64b086a51664b3d73b6bfaed93 100644
--- a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
+++ b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
@@ -3,7 +3,9 @@
 
 namespace py = pybind11;
 namespace Aidge {
-void init_cpu_sys_info(py::module& m){
-    m.def("show_cpu_version", &showCpuVersion);
+void init_CpuVersionInfo(py::module& m){
+    m.def("show_version", &showBackendCpuVersion);
+    m.def("get_project_version", &getBackendCPUProjectVersion);
+    m.def("get_git_hash", &getBackendCPUGitHash);
 }
 }
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..aa0f227f6688468a5ab93384f7b1670086000035
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+# pbr file
+[metadata]
+version = file: version.txt
diff --git a/setup.py b/setup.py
index 22cbd9732c8b9e1099c3e322032e8377f6d4506b..82edd09c90413f4c81ab3356d57e0380d4eab5ab 100644
--- a/setup.py
+++ b/setup.py
@@ -11,8 +11,10 @@ from math import ceil
 from setuptools import setup, Extension
 from setuptools.command.build_ext import build_ext
 
+def get_project_name() -> str:
+    return (pathlib.Path().absolute() / "project_name.txt").read_text().strip()
 
-PROJECT_NAME = "aidge_backend_cpu"
+PROJECT_NAME = get_project_name()
 
 SETUP_DIR = pathlib.Path(__file__).parent
 
@@ -66,7 +68,7 @@ class AidgePkgBuild(build_ext):
             else []
         )
         test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
-        
+
         self.spawn(
             [
                 "cmake",
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 01a5e8cf1772161f5cf98d3a8bd52f43ac7a1d0d..eb5ef87bd16620cdef33330bd7b39ece1783ecfc 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                op_.kernelDims(),
+               op_.dilations(),
                op_.getInput(0)->template dims<4>(),
+               op_.ceilMode(),
                getCPUPtr(op_.getInput(0)),
                getCPUPtr(op_.getOutput(0)));
 }
diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5926212e8453a32e54de7691343d44f9c6849a05
--- /dev/null
+++ b/src/operator/EqualImpl.cpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
+
+template <>
+void Aidge::EqualImpl_cpu::forward() {
+    const Equal_Op& op = static_cast<const Equal_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot Equal inputs with two different data types.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
+}
+
+template <>
+void Aidge::EqualImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu");
+}
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 90075a397be3f082ef95fd4df074c99d926fd385..13ef75b0eb92cea700e5b487a06046fc9285d5ad 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index 422bdd005f058fc9200cf5f7962bfc8d5877e6e1..a90d521a759f1ce6f4883bdd0bc05d84daa0f668 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -58,6 +58,7 @@ void Aidge::MulImpl_cpu::backward() {
                /* grad0Length  */ out0grad->size(),
                /* input0Dims   */ in0->dims(),
                /* input1Dims   */ in1->dims(),
+               out0grad->dims(),
                getCPUPtr(in0),
                getCPUPtr(in1),
                getCPUPtr(out0grad),
diff --git a/src/operator/WeightInterleavedImpl.cpp b/src/operator/WeightInterleavedImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c9f3a6e8df35616a4f7ffae86cbeacd841f44bf
--- /dev/null
+++ b/src/operator/WeightInterleavedImpl.cpp
@@ -0,0 +1,75 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+
+#include <cstddef>  // std::size_t
+#include <functional>  // std::multiplies
+#include <memory>
+#include <numeric>  // std::accumulate
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
+
+
+template <>
+void Aidge::WeightInterleavedImpl_cpu::forward()
+{
+    const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp);
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0");
+
+    const auto impl = Registrar<WeightInterleavedImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
+
+    // inputInterleaving is the number of consecutive input elements that will be compacted
+    // Here the interleaving is the last dimension (cf STM32 low bit kernels)
+    std::size_t inputInterleaving = input0.dims().back();
+
+    // The resulting compacted dimension was computed in forwardDims and the output tensor was resized
+    std::size_t outputInterleaving = op_.getOutput(0)->dims().back();
+
+    // nb_interleaving is the number of compacted segments
+    std::size_t nbInterleaving;
+
+    // Determine the number of segment to compact
+    if (input0.dims().size() > 1){
+        nbInterleaving = std::accumulate(
+        input0.dims().cbegin(),
+        std::prev(input0.dims().cend()), // Exclude the last element
+        std::size_t(1),
+        std::multiplies<std::size_t>());
+    } else {
+        // Case when the weight tensor is only one dimension
+        nbInterleaving = 1;
+    }
+
+    impl.forward(inputInterleaving,
+        nbInterleaving,
+        outputInterleaving,
+        input0.getImpl()->rawPtr(),
+        getCPUPtr(mOp.getRawOutput(0)));
+
+
+}
+
+template <>
+void Aidge::WeightInterleavedImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu");
+}
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 5984524fdc8c596641e505897d16e12de78024cc..7e63fb2be1a8aa9e7955b768c736302e9f9e3920 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,12 +1,19 @@
-Include(FetchContent)
+find_package(Catch2 QUIET)
 
-FetchContent_Declare(
-  Catch2
-  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.7.1 # or a later release
-)
+if(NOT Catch2_FOUND)
+    message(STATUS "Catch2 not found in system, retrieving from git")
+    Include(FetchContent)
 
-FetchContent_MakeAvailable(Catch2)
+    FetchContent_Declare(
+      Catch2
+      GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+      GIT_TAG        v3.7.1 # or a later release
+    )
+
+    FetchContent_MakeAvailable(Catch2)
+else()
+    message(STATUS "Found system Catch2 version ${Catch2_VERSION}")
+endif()
 
 file(GLOB_RECURSE src_files "*.cpp")
 
diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp
index fd938f10a947d1520600a1d00022eeb970cd76e6..2bc1e7d4c6f8a7cfbae8807e3021f9c5dd89fff6 100644
--- a/unit_tests/data/Test_TensorImpl.cpp
+++ b/unit_tests/data/Test_TensorImpl.cpp
@@ -9,19 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
-#include "aidge/data/Tensor.hpp"
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/operator/Add.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 namespace Aidge {
 
@@ -35,8 +39,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") {
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
-    std::shared_ptr<Node> mySub = Add();
-    auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bca4025705cb1c851dcf3e9accbf016c4535120a..bff9629be152163b2aa92bdc9d0c3029d7987b9b 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -9,12 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -96,7 +100,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Array4D<int,3,3,3,2> {
             {                                               //
                 {                                           //
                     {{ 120, 222},{ 124, 226},{ 128, 230}},  //
@@ -114,7 +118,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
                     {{ 144, 246},{ 148, 250},{152, 254}}    //
                 }                                           //
             }                                               //
-        });                                                 //
+        };                                                 //
 
         std::shared_ptr<Node> myAdd_0 = Add();
         std::shared_ptr<Node> myAdd_1 = Add();
@@ -131,8 +135,8 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         op_1->setBackend("cpu");
         myAdd_0->forward();
         myAdd_1->forward();
-        op_1->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op_1->getOutput(0) == *expectedOutput);
+        Log::info("Add_1 Tensor:\n{}", *(op_1->getOutput(0)));
+        Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
+        REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 053bb3ea4ed913bd388f3ae049c4d6402ad58d59..148298d5f44b9f7744ab18bcfc5ce675f77a784c 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -9,86 +9,109 @@
  *
  ********************************************************************************/
 
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <memory>
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
 #include <catch2/catch_test_macros.hpp>
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
 
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/And.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-        SECTION("ForwardDims")
-    {
+    SECTION("ForwardDims") {
         constexpr std::uint16_t NBTRIALS = 10;
         // Create a random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
+        std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5);
 
         SECTION("Same dimensions") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dims[i] = dimSizeDist(gen);
                 }
-
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = boolDist(gen);
+                    array1[i] = boolDist(gen);
+                }
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+
+                myInput1 -> getImpl() -> setRawPtr(array0, nb_elements);
+                myInput2 -> getImpl() -> setRawPtr(array1, nb_elements);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
                 op->forwardDims();
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == dims);
+                delete[] array0;
+                delete[] array1;
             }
         }
+
         SECTION("Broadcasting") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims1(nbDims, 1);
                 std::vector<DimSize_t> dims2(nbDims, 1);
                 std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                    if (boolDist(gen)) dims1[i] = dim;
+                    if (boolDist(gen)) dims2[i] = dim;
+                    expectedOutDims.push_back(std::max(dims1[i], dims2[i]));
                 }
 
+                const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements0];
+                float* array1 = new float[nb_elements1];
+                for (std::size_t i = 0; i < nb_elements0; ++i) {
+                    array0[i] = boolDist(gen);
+                }
+                for (std::size_t i = 0; i < nb_elements1; ++i) {
+                    array1[i] = boolDist(gen);
+                }
 
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1 -> getImpl() -> setRawPtr(array0, nb_elements0);
+                myInput2 -> getImpl() -> setRawPtr(array1, nb_elements1);
+
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
 
@@ -96,110 +119,67 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == expectedOutDims);
+                delete[] array0;
+                delete[] array1;
             }
         }
     }
+
     SECTION("Same size inputs") {
-        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-        {                                       //
-            {                                   //
-                {{20, 15},{31, 11},{22, 49}},   //
-                {{41, 10},{24, 51},{27, 52}},   //
-                {{26, 53},{27, 54},{28, 55}}    //
-            },                                  //
-            {                                   //
-                {{29, 56},{30, 57},{31, 58}},   //
-                {{32, 59},{33, 60},{34, 61}},   //
-                {{35, 62},{36, 63},{37, 64}}    //
-            },                                  //
-            {                                   //
-                {{38, 65},{39, 66},{40, 67}},   //
-                {{41, 68},{42, 69},{43, 70}},   //
-                {{44, 71},{45, 72},{46, 73}}    //
-            }                                   //
-        }                                       //
-    });                                         //
-        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {                                       //
-                {                                   //
-                    {{20, 47},{21, 48},{22, 49}},   //
-                    {{23, 50},{24, 51},{25, 52}},   //
-                    {{17, 53},{27, 26},{14, 33}}    //
-                },                                  //
-                {                                   //
-                    {{29, 56},{30, 57},{31, 58}},   //
-                    {{72, 44},{33, 20},{27, 55}},   //
-                    {{35, 24},{25, 63},{28, 64}}    //
-                },                                  //
-                {                                   //
-                    {{32, 65},{39, 66},{40, 70}},   //
-                    {{41, 53},{42, 60},{34, 70}},   //
-                    {{44, 71},{30, 12},{46, 73}}    //
-                }                                   //
-            }                                       //
-        });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
             {
-                {
-                    {{1, 0},{0, 0},{1, 1}},
-                    {{0, 0},{1, 1},{0, 1}},
-                    {{0, 1},{1, 0},{0, 0}}
-                },
-                {
-                    {{1, 1},{1, 1},{1, 1}},
-                    {{0, 0},{1, 0},{0, 0}},
-                    {{1, 0},{0, 1},{0, 1}}
-                },
-                {
-                    {{0, 1},{1, 1},{1, 0}},
-                    {{1, 0},{1, 0},{0, 1}},
-                    {{1, 1},{0, 0},{1, 1}}
-                }
-            }
-        });
+                {{{1, 0}, {0, 1}},
+                {{1, 1}, {0, 0}}},
+                {{{0, 1}, {1, 0}},
+                {{1, 0}, {0, 1}}}}
+            });
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 1}}},
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 0}}}}
+            });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 0}},
+                {{0, 1}, {0, 0}}},
+                {{{0, 1}, {0, 0}},
+                {{0, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input2);
         op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         myAnd->forward();
-
+        op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 
     SECTION("Broadcasting") {
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
-                {{10, 20},{22, 23},{20, 20}},   //
-                {{10, 15},{10, 29},{20, 20}},   //
-                {{26, 25},{33, 20},{10, 20}}    //
-            }                                   //
-        }                                       //
-        });                                     //
-
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {                                   //
-                {                               //
-                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
-                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
-                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
-                }                               //
-            }                                   //
-        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 1}, {0, 0}}}}
+            });
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input_1);
         op->associateInput(1, input_2);
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         myAnd->forward();
-        op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
-}
\ No newline at end of file
+}
diff --git a/unit_tests/operator/Test_ArgMaxImpl.cpp b/unit_tests/operator/Test_ArgMaxImpl.cpp
index 9915d90423e976db1bdd2a694a2cfd7beb380cee..894697f65a6f73af27a568b994c1dd2dc6b118f3 100644
--- a/unit_tests/operator/Test_ArgMaxImpl.cpp
+++ b/unit_tests/operator/Test_ArgMaxImpl.cpp
@@ -9,17 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/ArgMax.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
-#include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -118,8 +121,8 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2") {
 
             Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
-               { 
-                    { 
+               {
+                    {
                         {3.0},
                         {2.0},
                         {1.0}
@@ -144,7 +147,7 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2 with keep_dims false") {
 
             Tensor myOutput = Tensor(Array2D<float,2,3> {
-               { 
+               {
                     { 3.0, 2.0, 1.0 },
                     { 2.0, 1.0, 0.0 }
                }
@@ -196,10 +199,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
             op->associateInput(0,myInput);
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
-            std::cout << " ...............  "<< std::endl;
+            fmt::print("{:.^20}\n", "forward");
             myArgMax->forward();
+            fmt::print("{:.^20}\n", "result");
             op->getOutput(0)->print();
-            std::cout <<"------"<<std::endl;
+            fmt::print("{:.^20}\n", "truth");
             myOutput.print();
 
             REQUIRE(*(op->getOutput(0)) == myOutput);
diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp
index 9548e35d81b0423125424a4198d82558c4e57df4..b9438db0b38642e8c49e46451544a68714ac4de6 100644
--- a/unit_tests/operator/Test_Atan.cpp
+++ b/unit_tests/operator/Test_Atan.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Atan.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -32,7 +36,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
              0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
@@ -61,7 +65,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
                                   {0.75377332, 0.77411225, 0.32928031}}}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index aaa2757830c245275d02792a7a5a2eb1db32d7b8..372febc61d04c2ba983dd33f009fe5bf1d2908a0 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
-#include <cstdlib>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/AvgPooling.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -53,7 +57,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
     });
     SECTION("Stride") {
         std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
             {
@@ -90,7 +94,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         }
         });
         std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mycdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         Tensor myOutput = Array4D<float,1,1,1,1> {
             {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
diff --git a/unit_tests/operator/Test_BatchNormImpl.cpp b/unit_tests/operator/Test_BatchNormImpl.cpp
index 1b42c90dd09d63cd319f19bd29751da816db06c0..26e964f9386e19a6070d75a4106b6b46a29e455d 100644
--- a/unit_tests/operator/Test_BatchNormImpl.cpp
+++ b/unit_tests/operator/Test_BatchNormImpl.cpp
@@ -9,20 +9,24 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/BatchNorm.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] BatchNorm(forward)", "[BatchNorm][CPU]") {
     std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
-    auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator());
+    auto op = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNorm -> getOperator());
     std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}});
     std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}});
     std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}});
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index a52990bc7991a325ce151cf6634b0d5a831992c8..33ab932e296be717604be42716d7abe2b61f65ee 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -9,15 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
 #include <cstddef>   // std::size_t
 #include <cstdint>   // std::uint16_t
 #include <chrono>
-#include <iostream>
 #include <memory>
-#include <numeric>   
+#include <numeric>
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/BitShift.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +34,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<int> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
@@ -131,8 +136,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("Test BitShift kernels with Broadcasting") {
             std::size_t number_of_operation = 0;
@@ -194,7 +199,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                                 }
                                 else
                                 {
-                                    result[idx_out + d] = array0[idx0] >> array1[idx1];                               
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];
                                 }
                             }
                         }
@@ -222,12 +227,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
                 // comparison between truth and computed result
-                bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres));
-                if(equiv == false)
-                {
-                    std::cout << "Problem\n";
-                }
-                REQUIRE(equiv);
+                REQUIRE(approxEq<int>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] array1;
@@ -236,8 +236,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
 }
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
index 45c8da5bf7ecc84fad6b3e694fe204540f579af3..99147ac93bd659dd91897f6b7f1f3f33e5552ef6 100644
--- a/unit_tests/operator/Test_ClipImpl.cpp
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -9,36 +9,37 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <algorithm>  // std::max, std::min
+#include <chrono>
 #include <cstddef>  // std::size_t
 #include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include <iomanip>
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Clip.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include "aidge/backend/cpu.hpp"
 
 void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) {
     if (vec1.size() != vec2.size()) {
-        std::cerr << "Vectors should have the same sizes." << std::endl;
+        fmt::print(stderr, "Vectors should have the same sizes.\n");
         return;
     }
 
-    for (size_t i = 0; i < vec1.size(); ++i) {
+    for (std::size_t i = 0; i < vec1.size(); ++i) {
         if (vec1[i] < min || vec1[i] > max) {
             vec2[i] = 0.0f;
         }
     }
 }
-namespace Aidge 
+namespace Aidge
 {
 TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
  {
@@ -47,8 +48,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
     std::random_device rd;
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> dis(0.0, 10.0);
-    std::uniform_real_distribution<float> dismin(0.0, 4.5); 
-    std::uniform_real_distribution<float> dismax(5.5, 10.0); 
+    std::uniform_real_distribution<float> dismin(0.0, 4.5);
+    std::uniform_real_distribution<float> dismax(5.5, 10.0);
     std::uniform_int_distribution<std::size_t> distDims(5,15);
     std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5);
 
@@ -71,7 +72,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -80,7 +81,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -109,7 +110,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -118,9 +119,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
+    }
     SECTION("Clip test with min >= max [Forward]") {
         std::size_t totalComputation = 0;
         for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
@@ -131,7 +132,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -140,7 +141,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismax(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -169,7 +170,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -178,13 +179,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
+    }
     SECTION("Clip with Clip Attr [Forward]")
     {
         std::size_t totalComputation = 0;
-        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) 
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
         {
 
             float min = dismin(gen);
@@ -200,7 +201,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
             // Convert Input to Tensor
@@ -231,8 +232,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
     SECTION("Simple clip test [Backward]") {
         std::size_t totalComputation = 0;
@@ -243,13 +244,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             // generate Tensors dimensions
             const std::size_t dim0 = distDims(gen);
             const std::size_t dim1 = distDims(gen);
-  
+
             totalComputation += dim0*dim1;
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
             float* gradArray = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
                 gradArray[i] = dis(gen);
             }
@@ -264,7 +265,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -296,7 +297,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             myClip->forward();
 
             op->getOutput(0)->setGrad(TGrad);
-            
+
             start = std::chrono::system_clock::now();
             REQUIRE_NOTHROW(myClip->backward());
             end = std::chrono::system_clock::now();
@@ -310,9 +311,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
             REQUIRE(GT1 == BackwardTensorVec);
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
  }
-} // namespace Aidge 
+} // namespace Aidge
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 42505d385fde7e72e09531f1607287ffc6978f75..8ec1669b92a5116999413cf55a8c5113363ef330 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -9,32 +9,27 @@
  *
  ********************************************************************************/
 
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstddef> // std::size_t
-#include <cstdint> // std::uint16_t
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int64_t, std::uint16_t
 #include <memory>
-#include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
 
-#include "catch2/internal/catch_compiler_capabilities.hpp"
-#include "catch2/internal/catch_enforce.hpp"
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators_random.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include <aidge/data/Data.hpp>
-#include <aidge/data/half.hpp>
-#include <aidge/filler/Filler.hpp>
-#include <aidge/operator/OperatorTensor.hpp>
-#include <aidge/operator/Reshape.hpp>
-#include <aidge/utils/TensorUtils.hpp>
-#include <aidge/utils/Types.h>
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
 TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
@@ -62,7 +57,7 @@ TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
     result->setDataType(DataType::Int64);
     result->setBackend("cpu");
     for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<int64_t>(i, input_tensor_values_dist(gen));
+      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
     }
     return result;
   };
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 5d7dfdf12032d4c444e38cda6d2a4298fc552b14..4037b2ad4e117573279f07d0c1819d3435ee7ada 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Div.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a229b8ce3ebcd7672323f2585e3a48343f544c3d
--- /dev/null
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -0,0 +1,205 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Equal.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
+        SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("Same dimensions") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Broadcasting") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims1(nbDims, 1);
+                std::vector<DimSize_t> dims2(nbDims, 1);
+                std::vector<DimSize_t> expectedOutDims;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    DimSize_t dim = dimSizeDist(gen);
+                    if (boolDist(gen)) {
+                        dims1[i] = dim;
+                    }
+                    if (boolDist(gen)) {
+                        dims2[i] = dim;
+                    }
+                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                }
+
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 15},{31, 11},{22, 49}},   //
+                {{41, 10},{24, 51},{27, 52}},   //
+                {{26, 53},{27, 54},{28, 55}}    //
+            },                                  //
+            {                                   //
+                {{29, 56},{30, 57},{31, 58}},   //
+                {{32, 59},{33, 60},{34, 61}},   //
+                {{35, 62},{36, 63},{37, 64}}    //
+            },                                  //
+            {                                   //
+                {{38, 65},{39, 66},{40, 67}},   //
+                {{41, 68},{42, 69},{43, 70}},   //
+                {{44, 71},{45, 72},{46, 73}}    //
+            }                                   //
+        }                                       //
+    });                                         //
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input2);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Int32);
+        myEqual->forward();
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myEqual->forward();
+        op->getOutput(0)->print();
+        expectedOutput->print();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 43af544871ad6c2ac319de09f3c6fce5065e60d5..8e8536accadcb874f74d4d962aae435bc1351d6e 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -9,34 +9,29 @@
  *
  ********************************************************************************/
 
-#include <aidge/utils/Types.h>
-#include <catch2/catch_test_macros.hpp>
 #include <chrono>
-#include <cmath>
 #include <cstddef> // std::size_t
 #include <cstdint> // std::uint16_t
-#include <iostream>
+#include <functional>  // std::multiplies
 #include <memory>
 #include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-
-// debug print function
-void print_tensor(Aidge::Tensor &T) {
-  // Print tensors
-  std::cout << "Tensor : size =  [";
-  for (auto &dim : T.dims()) {
-    std::cout << dim << " , ";
-  }
-  std::cout << "]" << std::endl;
-  T.print();
-}
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
 TEST_CASE("[cpu/operator] GlobalAveragePooling",
           "[GlobalAveragePooling][CPU]") {
   constexpr std::uint16_t NBTRIALS = 10;
@@ -54,9 +49,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
                                                             std::size_t(7));
 
   // Create MatGlobalAveragePooling Operator
-  std::shared_ptr<Node> globAvgPool = GlobalAveragePooling();
-  auto op =
-      std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator());
+  std::shared_ptr<GlobalAveragePooling_Op> op = std::make_shared<GlobalAveragePooling_Op>();
   op->setDataType(DataType::Float32);
   op->setBackend("cpu");
 
@@ -99,7 +92,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
       T0->resize(dims);
       T0->getImpl()->setRawPtr(array0, nb_elements);
 
-      REQUIRE_THROWS(globAvgPool->forward());
+      REQUIRE_THROWS(op->forward());
       delete[] array0;
     }
 
@@ -158,7 +151,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
         op->forwardDims();
         start = std::chrono::system_clock::now();
-        REQUIRE_NOTHROW(globAvgPool->forward());
+        REQUIRE_NOTHROW(op->forward());
         end = std::chrono::system_clock::now();
         duration +=
             std::chrono::duration_cast<std::chrono::microseconds>(end - start);
@@ -231,7 +224,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -358,7 +351,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -547,7 +540,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -561,12 +554,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           delete[] result;
         }
       }
-      std::cout << "GlobalAveragePooling total execution time : "
-                << duration.count() << "µs" << std::endl;
-      std::cout << "Number of operations : " << number_of_operation
-                << std::endl;
-      std::cout << "Operation / µs = " << number_of_operation / duration.count()
-                << std::endl;
+      Log::info("GlobalAveragePooling total execution time: {}µs\n", duration.count());
+      Log::info("Number of operations : {}\n", number_of_operation);
+      Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
     }
   }
 }
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index d6e934b4dc8d84e8a595eb74d1af9d2c68c892d1..f062f06cddfbd04217d63e1edcb6505914bc77e9 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -9,21 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>  // std::size_t
-#include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
-#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
-
 namespace Aidge {
 
 TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
@@ -106,8 +111,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("3-D Tensors") {
@@ -174,8 +179,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("4-D Tensors") {
@@ -244,8 +249,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("+2-D / 1-D") {
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index af04ede4e33c32ce785804e2484b6ba9ac5edc36..7db082c5c8c564648f65bedc27240e3d9b775eef 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -12,6 +12,7 @@
 #include <catch2/catch_test_macros.hpp>
 #include <memory>
 #include <cstdlib>
+#include <iostream>
 
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MaxPooling.hpp"
@@ -79,4 +80,39 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
+    SECTION("Dilation") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {
+                        {0.71470, 0.52770},
+                        {0.71470, 0.48740}
+                    },
+                    {
+                        {2.23290, 0.48590},
+                        {2.23290, 0.07000}
+                    }
+                },
+                {
+                    {
+                        {1.76530, 1.20710},
+                        {1.76530, 1.20710}
+                    },
+                    {
+                        {1.04290, 0.67760},
+                        {1.72170, 0.67760}
+                    }
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+        myMaxPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_Memorize.cpp b/unit_tests/operator/Test_Memorize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..45ab40c5315e88b2db48c1fba92da0ab9b587398
--- /dev/null
+++ b/unit_tests/operator/Test_Memorize.cpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <memory>
+#include <string>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/graph/GraphView.hpp"
+#include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Producer.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/recipes/GraphViewHelper.hpp"
+
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") {
+    SECTION("Test simple") {
+        std::shared_ptr<Tensor> inputTensor =
+                std::make_shared<Tensor>(Array1D<int, 1>{{1}});
+
+        auto input = Producer({1}, "input");
+        auto init = Producer({1}, "init");
+        auto add = Add("add");
+        auto mem = Memorize(3, "mem");
+
+        input->addChild(add, 0, 0);
+        init->addChild(mem, 0, 1);
+        add->addChild(mem, 0,0);
+        mem->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
+
+        input->getOperator()->setOutput(0, inputTensor);
+        init->getOperator()->setOutput(0, inputTensor);
+
+        auto g = getConnectedGraphView(input);
+
+        g->setDataType(Aidge::DataType::Int32);
+        g->setBackend("cpu");
+        g->forwardDims();
+        g->save("simple_graph");
+
+        SequentialScheduler scheduler(g);
+        REQUIRE_NOTHROW(scheduler.forward());
+        scheduler.saveSchedulingDiagram("simple");
+
+        const auto expectedOutput = std::make_shared<Tensor>(Array1D<int, 1>{{4}});
+        std::shared_ptr<Tensor> other = std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0);
+        other->print();
+        REQUIRE((*other == *expectedOutput));
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 3378861d0d3d7e74e7867c2765a0b09069fa8caf..2937e94938c671140eeeee87d47d5c48f685203e 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -9,379 +9,336 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
 #include <chrono>
-#include <iostream>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution,
+                   // std::uniform_int_distribution
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Mul.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/Log.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
-    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
-    {
-        std::shared_ptr<Node> myMul = Mul();
-        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
-
-        SECTION("Case 1: 2D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
-                {
-                    {
-                        {1,2,3},{4,5,6}
-                    }
-                }
-            ));
+TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
 
-            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {0.1,0.2,0.3}
-            ));
+    // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions.
 
-            T0->setDataType(DataType::Float32);
-            T0->setBackend("cpu");
-            T1->setDataType(DataType::Float32);
-            T1->setBackend("cpu");
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
 
-            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
 
-            op->associateInput(0,T0);
-            op->associateInput(1,T1);
-            op->forwardDims();
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
 
-            myMul->forward();
-            myMul->backward();
+        op->backward();
 
-            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
-            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}});
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
-        }
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({5, 7, 9});
 
-        SECTION("Case 2: 3D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {7.0, 8.0, 9.0},
-                            {10.0, 11.0, 12.0}
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
-
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                    {
-                        {
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            },
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            }
-                        }
-                    }
-                ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                {
-                    {
-                        {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
-                        },
-                        {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
-                        }
-                    }
-                }
-            ));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {22.0, 26.0, 30.0}
-            ));
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
 
-            for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
 
-            myMul->backward();
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}},
+                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}});
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
-        }
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({22.0, 26.0, 30.0});
 
-        SECTION("Case 3: 4D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
-                {
-                    {
-                        {
-                            {
-                                {1.0, 2.0, 3.0},
-                                {4.0, 5.0, 6.0},
-                                {7.0, 8.0, 9.0}
-                            },
-                            {
-                                {10.0, 11.0, 12.0},
-                                {13.0, 14.0, 15.0},
-                                {16.0, 17.0, 18.0}
-                            }
-                        },
-                        {
-                            {
-                                {19.0, 20.0, 21.0},
-                                {22.0, 23.0, 24.0},
-                                {25.0, 26.0, 27.0}
-                            },
-                            {
-                                {28.0, 29.0, 30.0},
-                                {31.0, 32.0, 33.0},
-                                {34.0, 35.0, 36.0}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
-                {
-                    {
-                        {0.5,0.3,0.1},
-                        {0.4,0.2,0.6},
-                        {0.7,0.8,0.9}
-                    }
-                }
-            ));
-
-            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
-                {
-                    {
-                        {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
-                        },
-                        {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
-                {
-                    {
-                        {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
-                        },
-                        {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
-                {
-                    {
-                        {58.0, 62.0, 66.0},
-                        {70.0, 74.0, 78.0},
-                        {82.0, 86.0, 90.0}
-                    }
-                }
-            ));
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
 
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}},
+                  {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{58.0, 62.0, 66.0},
+                                   {70.0, 74.0, 78.0},
+                                   {82.0, 86.0, 90.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}},
+                                      {{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
+                                   {22.0, 24.0, 26.0, 28.0},
+                                   {30.0, 32.0, 34.0, 36.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            myMul->backward();
+    SECTION("Case 5: Tensors with random values") {
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
         }
 
-        SECTION("Case 4: 3D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0, 4.0},
-                            {5.0, 6.0, 7.0, 8.0},
-                            {9.0, 10.0, 11.0, 12.0},
-                        },
-                        {
-                            {13.0, 14.0, 15.0, 16.0},
-                            {17.0, 18.0, 19.0, 20.0},
-                            {21.0, 22.0, 23.0, 24.0},
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
-                {
-                    {
-                        {0.1, 0.2, 0.3, 0.4},
-                        {0.5, 0.6, 0.7, 0.8},
-                        {0.9, 1.0, 1.1, 1.2}
-                    }
-                }
-            ));
-
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
-                {
-                    {
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        },
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
-                {
-                    {
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        },
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
-                {
-                    {
-                        {14.0, 16.0, 18.0, 20.0},
-                        {22.0, 24.0, 26.0, 28.0},
-                        {30.0, 32.0, 34.0, 36.0}
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] * input1Data[in1Idx];
                     }
                 }
-            ));
-
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                T->setBackend("cpu") ;
-                T->setDataType(DataType::Float32);
             }
+        }
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        auto outputTensor = op->getOutput(0);
 
-            myMul->backward();
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
         }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output * input1
+                        expectedGrad0[in0Idx] +=
+                            gradOutputData[outIdx] * input1Data[in1Idx];
+
+                        // Gradient for input1: grad_output * input0
+                        expectedGrad1[in1Idx] +=
+                            gradOutputData[outIdx] * input0Data[in0Idx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+
+        // Optional: Print some values for verification
+        // std::cout << "Input shapes: (" << dims0[0] << "," << dims0[1] <<
+        // "," << dims0[2] << "," << dims0[3]
+        //           << ") * (" << dims1[0] << "," << dims1[1] << "," <<
+        //           dims1[2]
+        //           << ") -> (" << outputDims[0] << "," << outputDims[1]
+        //           << "," << outputDims[2] << "," << outputDims[3] <<
+        //           ")\n";
+        // std::cout << "Input sizes: " << input0_size << " * " <<
+        // input1_size << " -> " << output_size << "\n";
     }
+}
 
-TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
+TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
-    std::uniform_int_distribution<int> boolDist(0,1);
-
-    // Create MatMul Operator
-    std::shared_ptr<Node> myMul = Mul();
-    auto op = std::static_pointer_cast<OperatorTensor>(myMul-> getOperator());
+    std::uniform_real_distribution<float> valueDist(
+        0.1f,
+        1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
+                                                           std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1),
+                                                          std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0, 1);
+
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-    // Create 2 input Tensors
     std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-    op->associateInput(0,T0);
+    op->associateInput(0, T0);
     T0->setDataType(DataType::Float32);
     T0->setBackend("cpu");
     std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
-    op -> associateInput(1,T1);
+    op->associateInput(1, T1);
     T1->setDataType(DataType::Float32);
     T1->setBackend("cpu");
 
-    // Create results Tensor
     std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
     Tres->setDataType(DataType::Float32);
     Tres->setBackend("cpu");
@@ -391,14 +348,9 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::chrono::time_point<std::chrono::system_clock> end;
     std::chrono::duration<double, std::micro> duration{};
 
-
     SECTION("MulImpl_cpu::forward()") {
-        SECTION("Scalar / Scalar") {
-
-        }
-        SECTION("Scalar / +1-D Tensor") {
-
-        }
+        SECTION("Scalar / Scalar") {}
+        SECTION("Scalar / +1-D Tensor") {}
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
 
             std::size_t number_of_operation = 0;
@@ -413,13 +365,17 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                     dims.push_back(dimSizeDist(gen));
                 }
 
-                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const auto nb_elements =
+                    std::accumulate(dims.cbegin(),
+                                    dims.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
-                float* array0 = new float[nb_elements];
-                float* array1 = new float[nb_elements];
-                float* result = new float[nb_elements];
+                float *array0 = new float[nb_elements];
+                float *array1 = new float[nb_elements];
+                float *result = new float[nb_elements];
 
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
@@ -429,21 +385,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // input0
                 T0->resize(dims);
-                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+                T0->getImpl()->setRawPtr(array0, nb_elements);
 
                 // input1
                 T1->resize(dims);
-                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+                T1->getImpl()->setRawPtr(array1, nb_elements);
 
                 // results
                 Tres->resize(dims);
-                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+                Tres->getImpl()->setRawPtr(result, nb_elements);
 
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
@@ -451,24 +409,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
-
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
 
                 // generate 2 random Tensors
-                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                // handle dimensions, replace some dimensions with '1' to get
+                // broadcasting
 
                 constexpr std::size_t nbDims = 4;
                 std::vector<std::size_t> dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dimensions.push_back(dimSizeDist(gen));
                 }
 
@@ -476,77 +433,90 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 auto dims1 = dimensions;
                 auto dimsOut = dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
-                    if (boolDist(gen))
-                    {
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
                         dims0[i] = 1;
                     }
 
-                    if (boolDist(gen))
-                    {
+                    if (boolDist(gen)) {
                         dims1[i] = 1;
                     }
 
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
-                for(auto dim : dims0)
-                {
+                for (auto dim : dims0) {
                     Log::info("Dimension of input 0 : {}", dim);
                 }
 
-                for(auto dim : dims1)
-                {
+                for (auto dim : dims1) {
                     Log::info("Dimension of input 1 : {}", dim);
                 }
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
-                {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                float *array1 =
+                    new float[dims1[0] * dims1[1] * dims1[2] * dims1[3]];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < dims0[0] * dims0[1] * dims0[2] * dims0[3];
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
 
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
-                {
+                for (std::size_t i = 0;
+                     i < dims1[0] * dims1[1] * dims1[2] * dims1[3];
+                     ++i) {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-
-                for (std::size_t a = 0; a < dimsOut[0]; ++a)
-                {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
-                    {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-
-                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
-
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
-                        {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
-                            {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
-                                                    + ((dims1[3] > 1) ? d : 0);
-
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1[1] * dims1[2] * dims1[3],
+                    dims1[2] * dims1[3],
+                    dims1[3],
+                    1};
+
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1[1] > 1) ? b : 0);
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] * ((dims1[2] > 1) ? c : 0) +
+                                    ((dims1[3] > 1) ? d : 0);
+
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1 <<
+                                // ") -> " << array0[idx0] << " * " <<
+                                // array1[idx1] << " -> " << idx_out + d <<
+                                // std::endl;
                             }
                         }
                     }
@@ -555,22 +525,30 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+                T1->getImpl()->setRawPtr(
+                    array1,
+                    dims1[0] * dims1[1] * dims1[2] * dims1[3]);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -579,15 +557,21 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
-            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(
+                std::size_t(1),
+                std::size_t(3));
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 // generate 2 random Tensors
@@ -604,15 +588,24 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                         dims1[i] = 1;
                     }
                 }
-                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+                dims1.erase(dims1.cbegin(),
+                            dims1.cbegin() + nbRemovedDimsDist(gen));
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                float* array1 = new float[array1_size];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                std::size_t array1_size =
+                    std::accumulate(dims1.cbegin(),
+                                    dims1.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
+                float *array1 = new float[array1_size];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < (dims0[0] * dims0[1] * dims0[2] * dims0[3]);
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
                 for (std::size_t i = 0; i < array1_size; ++i) {
@@ -621,27 +614,48 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // compute true result
                 auto dims1_tmp = dims1;
-                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
-
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
+                dims1_tmp.insert(dims1_tmp.cbegin(),
+                                 4 - dims1_tmp.size(),
+                                 std::size_t(1));
+
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1_tmp[1] * dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[3],
+                    1};
                 for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                     for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
                         for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
                             for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
-                                                    + ((dims1_tmp[3] > 1) ? d : 0);
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] *
+                                        ((dims1_tmp[2] > 1) ? c : 0) +
+                                    ((dims1_tmp[3] > 1) ? d : 0);
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1 <<
+                                // ") -> " << array0[idx0] << " * " <<
+                                // array1[idx1] << " -> " << idx_out + d <<
+                                // std::endl;
                             }
                         }
                     }
@@ -650,22 +664,28 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, array1_size);
+                T1->getImpl()->setRawPtr(array1, array1_size);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -674,13 +694,18 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
 } // namespace Aidge
+
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index cb5d8872c9c7242bb4aa4efca388d53b578417f9..55a416c3f404506359e06f9937dd958503236901 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -9,18 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cmath>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Pow.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -118,8 +126,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -213,8 +221,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -309,8 +317,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 
@@ -440,7 +448,7 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                     }
                 }
             ));
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
                 {
                     {
                         {
@@ -453,18 +461,13 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                         }
                     }
                 }
-            ));
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+            );
+            const Tensor expectedGrad1 = Array1D<float, 3>(
                 {
                     {14.14779854, 22.99299049, 33.56402588}
                 }
-            ));
+            );
 
-            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
             std::shared_ptr<Node> powOp = Pow();
             auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
             opr->setDataType(DataType::Float32);
@@ -475,8 +478,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             powOp->forward();
 
             powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
         }
     }
 }
diff --git a/unit_tests/operator/Test_ReduceSumImpl.cpp b/unit_tests/operator/Test_ReduceSumImpl.cpp
index 49569d1f65ff6c51f9681632b16375605ab326e7..0aa543da4f9d55dfa672790a5d99467cd794001a 100644
--- a/unit_tests/operator/Test_ReduceSumImpl.cpp
+++ b/unit_tests/operator/Test_ReduceSumImpl.cpp
@@ -9,17 +9,22 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::uint16_t, std::int32_t
 #include <memory>
-#include <numeric>   // std::accumulate
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/data/Data.hpp"  // DataType
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/operator/ReduceSum.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
 #include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/Types.h"
 
 using namespace Aidge;
 
@@ -112,7 +117,7 @@ TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
                 std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                 myInput->setBackend("cpu");
                 myInput->setDataType(DataType::Float32);
-                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true);
+                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<std::int32_t>{}, false, true);
                 auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
                 op->associateInput(0,myInput);
                 op->setDataType(DataType::Float32);
diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp
index b4cf9ffbedc18b35b42ebbc05971f86e0fa584e3..e658b0616683633ce19b2284abb9d4fae7942a23 100644
--- a/unit_tests/operator/Test_RoundImpl.cpp
+++ b/unit_tests/operator/Test_RoundImpl.cpp
@@ -9,15 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Round.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +37,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(-15, 15); 
+    std::uniform_real_distribution<float> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
 
@@ -59,7 +67,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                
+
                 // generate 2 random Tensors
                 const std::size_t nbDims = nbDimsDist(gen);
                 std::vector<std::size_t> dims;
@@ -72,7 +80,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // without broadcasting
                 float* array0 = new float[nb_elements];
                 float* result = new float[nb_elements];
-                
+
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
                     result[i] = std::nearbyint(array0[i]);
@@ -86,29 +94,22 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // results
                 Tres->resize(dims);
                 Tres -> getImpl() -> setRawPtr(result, nb_elements);
-                
+
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
                 myRound->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
-                bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres);
-
-                auto Output = *(op->getOutput(0));
-                
-                auto prt = Output.getImpl()->rawPtr();
-
-                REQUIRE(is_eq);
-                
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] result;
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 44666ae631152c8898e24f7003b0c2ede8c67b84..1317e88a371e9a6e7a3deae5b7f662a9cd879a60 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Sub.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c95c8fca19eb79eb78fc19e93ded3383054383e7
--- /dev/null
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -0,0 +1,436 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
+
+    std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
+    auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
+
+    SECTION("CompactDataSize - Single element cases") {
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1);  // 1 bit, needs 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1);  // 7 bits, needs 1 byte
+    }
+
+    SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1);  // 8 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2);  // 8 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4);  // 8 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4);  // 8 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Larger dataSize values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2);  // 16 elements at 1 bit each, fits in 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4);  // 16 elements at 2 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8);  // 16 elements at 3 bits each, needs 8 bytes (3-bit values pack two per byte)
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8);  // 16 elements at 4 bits each, needs 8 bytes
+    }
+
+    SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1);  // 7 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2);  // 7 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4);  // 7 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4);  // 7 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Minimum and maximum values for nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1);  // 5 elements at 1 bit each, fits in 1 byte
+    }
+
+    SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") {
+        REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0);  // No data elements
+    }
+
+
+    SECTION("CompactData - 4-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x0F),
+                                                                static_cast<std::int8_t>(0xF5),
+                                                                static_cast<std::int8_t>(0xB3),
+                                                                static_cast<std::int8_t>(0x9C)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5),
+                                                                static_cast<int8_t>(0x3C)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 3-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
+                                                                static_cast<int8_t>(0x04),
+                                                                static_cast<int8_t>(0xD3)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75),
+                                                                static_cast<int8_t>(0x43)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 2-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x03),
+                                                                 static_cast<std::int8_t>(0x02),
+                                                                 static_cast<std::int8_t>(0x01),
+                                                                 static_cast<std::int8_t>(0x00)}
+                                                                 });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int2);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xE4)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Single element data") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0x0F)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xF0)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0xA5),
+                                                                static_cast<int8_t>(0x34)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5),
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
+
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
+                                                                static_cast<int8_t>(0x04)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75),
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("Forward Op - Convolution weight interleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("Recipie ApplyWeightInterleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Dual_Int4);
+
+        // Create convolution node
+        std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
+
+        // Place the weight tensor in the weight producer of the conv
+        auto weightProducer = conv->getParent(1);
+        weightProducer->getOperator()->setOutput(0, weight);
+
+        // Set dataType, dataformat and backend of convolution
+        conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
+        conv->getOperator()->setDataType(Aidge::DataType::Int4);
+        conv->getOperator()->setBackend("cpu");
+
+        // Apply recipe
+        applyWeightInterleaving(conv);
+
+        // Compare the weight producer output tensor with the expected weights with interleaving
+        auto newProdOp = std::static_pointer_cast<OperatorTensor>(conv->getParent(1)->getOperator());
+        REQUIRE(*(newProdOp->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+}
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 78a10c308a60f026b83ea64cfbd25a848099eb90..9224d6f99ce6af17441c9dac85549ad9f544cb89 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -18,6 +18,10 @@
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
 #include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Pop.hpp"
+#include "aidge/operator/Stack.hpp"
+#include "aidge/operator/Identity.hpp"
+#include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
 
@@ -438,4 +442,69 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     predictedOutput->setGrad(targetOutput);
     REQUIRE_NOTHROW(scheduler.backward());
 }
+
+std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
+    // Builds a meta-operator whose micro-graph adds its input to a memorized
+    // hidden state on every step, i.e. a running accumulator over seqLength steps.
+    const auto subName = [&name](const char* suffix) {
+        return name.empty() ? std::string() : name + suffix;
+    };
+
+    auto entry = Identity(subName("_input"));
+    auto mem = Memorize(seqLength, subName("_hidden_state"));
+    auto adder = Add(subName("_add"));
+
+    // Wiring: input -> add(in 0); add -> memorize(in 0);
+    // memorize(out 1) -> add(in 1) closes the recurrence.
+    entry->addChild(adder, 0, 0);
+    adder->addChild(mem, 0, 0);
+    mem->addChild(/*otherNode=*/adder, /*outId=*/1, /*otherInId=*/1);
+
+    auto microGraph = std::make_shared<GraphView>();
+    microGraph->add(entry);
+    microGraph->add({mem, adder});
+    // Input 1 (the hidden-state seed) and output 0 are exposed by the meta-op.
+    microGraph->setOrderedInputs({{entry, 0}, {mem, 1}});
+    microGraph->setOrderedOutputs({{mem, 0}});
+
+    return MetaOperator("Accumulate", microGraph, {}, name);
+}
+
+TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
+    // Two time steps of a {3, 2} tensor; the Accumulate meta-op should sum them.
+    std::shared_ptr<Tensor> Input = std::make_shared<Tensor>(
+        Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                 {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+
+    // Initial value of the memorized hidden state (all zeros).
+    std::shared_ptr<Tensor> MemInit =
+        std::make_shared<Tensor>(Array2D<float, 3, 2>{
+            {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+
+    auto meta = Accumulate(2, "accumulate");
+    auto op = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator());
+    auto pop_i = Pop("pop_input");
+    auto pop_o = Identity("pop_output"); // NOTE: Could be Identity/Stack/Whatever node you want, this is not the problem here
+
+    // Connect the Pop output straight to the meta-op's internal micro-graph
+    // (instead of to the meta node itself; the commented-out lines below show
+    // the alternative wiring through the meta node).
+    pop_i->getOperator()->associateInput(0, Input);
+    pop_i->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
+    op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(pop_o, 0, 0);
+
+    //pop_i->addChild(meta, 0, 0);
+    //meta->addChild(pop_o, 0, 0);
+
+    // Seed the Memorize node's hidden state (its input 1) directly.
+    //op->associateInput(1, MemInit);
+    op->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, MemInit);
+
+    // Build the graph.
+    auto myGraph = std::make_shared<GraphView>();
+    myGraph->add(pop_i);
+    myGraph->add(op->getMicroGraph());
+    //myGraph->add(meta);
+    myGraph->add(pop_o);
+    myGraph->compile("cpu", DataType::Float32);
+
+    myGraph->save("accumulate_graph", true);
+
+    // Schedule and run
+    auto scheduler = SequentialScheduler(myGraph);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("accumulate_scheduling");
+    REQUIRE_NOTHROW(scheduler.forward(true));
+
+    // Expected: element-wise sum of the two time steps of Input above.
+    std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
+    std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
+    REQUIRE(*output == *expectedOutput);
+}
 } // namespace Aidge