diff --git a/.gitignore b/.gitignore
index 0e14676b900cb1418593019be70cc4d20aba2883..9877699f938bdbf94d0383b5b9db6f5f4cf1023e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 # C++ Build
 build*/
 install*/
+include/aidge/backend/cpu_version.h
 
 # VSCode
 .vscode
diff --git a/CHANGELOG b/CHANGELOG
index a461371a17b586e8ebc65172282153a6ae8e09e2..9153a9a2d4506bcd147c7cb1a114c6a3db18b268 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,5 @@
+# Version 0.5.0 (January 31, 2025)
+
 # Verson 0.4.0 (December 6, 2024)
 
 # Version 0.2.2 (May 14, 2024)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e191c36d5ad57a9a9dbed378154db6676ec796..66ef8ff28503a70de816d546b72e21d8528f0e33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,17 +1,23 @@
 cmake_minimum_required(VERSION 3.18)
-set(CXX_STANDARD 14)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
 
+# Parse version.txt to retrieve Major, Minor and Patch
+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ ${version})
+set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
+set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
+set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
+
 project(aidge_backend_cpu
         VERSION ${version}
         DESCRIPTION "CPU implementations of the operators of aidge framework."
         LANGUAGES CXX)
 
-message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
-message(STATUS "Project version: ${version}")
-add_definitions(-DPROJECT_VERSION="${version}")
-
+# Retrieve latest git commit
 execute_process(
     COMMAND git rev-parse --short HEAD
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
@@ -19,8 +25,10 @@ execute_process(
     OUTPUT_STRIP_TRAILING_WHITESPACE
     ERROR_QUIET
 )
+
+message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
+message(STATUS "Project version: ${version}")
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
-add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
 # helper for LSP users
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -64,6 +72,8 @@ file(GLOB_RECURSE inc_files "include/*.hpp")
 add_library(${module_name} ${src_files} ${inc_files})
 
 target_link_libraries(${module_name}
+    PRIVATE
+        fmt::fmt
     PUBLIC
         _aidge_core # _ is added because we link the exported target and not the project
 )
@@ -115,6 +125,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
     append_coverage_compiler_flags()
 endif()
 
+message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h")
+# Generate version.h file from config file version.h.in
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h"
+)
+
 ##############################################
 # Installation instructions
 include(GNUInstallDirs)
diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py
index a7fe1ea3abdea25b18af6e7e0a1958f01f928433..bb320b2fe436a3be81dde8d643728bd5a30942e7 100644
--- a/aidge_backend_cpu/__init__.py
+++ b/aidge_backend_cpu/__init__.py
@@ -1,3 +1,2 @@
 import aidge_core
 from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
-from ._version import *
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index caa75328e58f6c9581f81368a3981bb79a069d49..5db19a2b7a2f88dae13d8baf24cf95f961e730a0 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -12,6 +12,8 @@
 #ifndef AIDGE_CPU_IMPORTS_H_
 #define AIDGE_CPU_IMPORTS_H_
 
+#include "aidge/backend/cpu_version.h"
+
 #include "aidge/backend/cpu/operator/AbsImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AndImpl.hpp"
@@ -28,9 +30,11 @@
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/FoldImpl.hpp"
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/backend/cpu/operator/LRNImpl.hpp"
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
@@ -51,8 +55,8 @@
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
 #endif /* AIDGE_CPU_IMPORTS_H_ */
-
diff --git a/include/aidge/backend/cpu/operator/ExpandImpl.hpp b/include/aidge/backend/cpu/operator/ExpandImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..adfc6ab1ef2e6550c6307fb93d0079ea3b5fc5a2
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ExpandImpl.hpp
@@ -0,0 +1,35 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
+#define AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Expand.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ExpandImpl_cpu = OperatorImpl_cpu<Expand_Op,
+                                        void(const std::shared_ptr<Tensor> &,
+                                             const std::shared_ptr<Tensor> &,
+                                             void *,
+                                             const std::vector<DimSize_t> &)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Expand_Op, "cpu", Aidge::ExpandImpl_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f4341c333ffd4032a90b8ee5d50a8475e81253e
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp
@@ -0,0 +1,215 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+#include <aidge/data/Data.hpp>
+#include <aidge/data/Tensor.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/scheduler/ProdConso.hpp>
+#include <aidge/utils/Types.h>
+#include <cmath>
+#include <cstdint> // std::int32_t, std::int64_t
+#include <memory>
+#include <numeric>
+
+namespace {
+// suppose values are contiguous in memory
+template <class IO>
+void expandContiguousArray(const std::size_t inputStackSize,
+                           const std::size_t outputStackSize,
+                           const IO *input,
+                           IO *output) {
+    for (std::size_t i = 0; i < outputStackSize; ++i) {
+        output[i] = (inputStackSize == 1) ? input[0] : input[i];
+    }
+    return;
+}
+} // namespace
+
+namespace Aidge {
+
+template <class IO>
+void ExpandImpl_cpu_forward_kernel(
+    const std::shared_ptr<Tensor> &inData,
+    const std::shared_ptr<Tensor> &_inExpandShape,
+    void *_output,
+    const std::vector<DimSize_t> &outputDims) {
+
+    // retrieving data of inputShape & dimensions of inputDims
+    // as the process will require to modify the values
+    IO *output = static_cast<IO *>(_output);
+    std::vector<DimSize_t> inExpandShape(_inExpandShape->size());
+    for (DimSize_t i = 0; i < _inExpandShape->size(); ++i) {
+        inExpandShape[i] = _inExpandShape->get<std::int64_t>(i);
+    }
+    std::vector<DimSize_t> inDataDims = inData->dims();
+
+    // Example with 2 tensors
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions but adding 1s to le left of "smallest"
+    // tensor -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then ->
+    //    4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire
+    // arrays at once
+
+    if (inDataDims == inExpandShape) {
+        const std::size_t input0ContiguousSize =
+            std::accumulate(inDataDims.cbegin(),
+                            inDataDims.cend(),
+                            static_cast<std::size_t>(1),
+                            std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0ContiguousSize; ++i) {
+            output[i] = inData->get<IO>(i);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with
+    // ones.
+    if (inDataDims.size() > inExpandShape.size()) {
+        inExpandShape.insert(inExpandShape.cbegin(),
+                             inDataDims.size() - inExpandShape.size(),
+                             static_cast<DimSize_t>(1));
+    } else if (_inExpandShape->size() > inDataDims.size()) {
+        inDataDims.insert(inDataDims.cbegin(),
+                          inExpandShape.size() - inDataDims.size(),
+                          static_cast<DimSize_t>(1));
+    }
+
+    const std::size_t nbDims = inDataDims.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (inDataDims[contiguousIdx] != inExpandShape[contiguousIdx]) {
+            break;
+        }
+    }
+    if (contiguousIdx == (nbDims - 1)) {
+        // last dimensions of one of the input Tensor are of size 1
+        const std::vector<std::size_t> &dims =
+            (inDataDims[contiguousIdx] == 1) ? inDataDims : inExpandShape;
+        while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
+            --contiguousIdx;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t inputDataContiguousSize =
+        std::accumulate(inDataDims.cbegin() + contiguousIdx,
+                        inDataDims.cend(),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+    const std::size_t outputContiguousSize =
+        std::accumulate(outputDims.cbegin() + contiguousIdx,
+                        outputDims.cend(),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stridePostIn =
+        std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> strideStepIn =
+        std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stridePostIn[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2;
+             i != static_cast<std::size_t>(-1);
+             --i) {
+            stridePostIn[i] = stridePostIn[i + 1] *
+                              static_cast<std::int32_t>(inDataDims[i + 1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            strideStepIn[i] = (inDataDims[i] == 1) ? 1 - stridePostIn[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetInData = 0;
+    std::size_t offsetOut = 0;
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks =
+        std::accumulate(outputDims.cbegin(),
+                        outputDims.cbegin() + contiguousIdx,
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        expandContiguousArray<IO>(
+            inputDataContiguousSize,
+            outputContiguousSize,
+            &static_cast<const IO *>(
+                inData->getImpl()
+                    ->rawPtr())[offsetInData * inputDataContiguousSize],
+            &output[offsetOut * outputContiguousSize]);
+        if (++stack < nbStacks) {
+            std::size_t tmpStack = stack;
+            while (tmpStack % outputDims[dim] == 0) {
+                tmpStack /= outputDims[dim];
+                dim--;
+            }
+            offsetInData += strideStepIn[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int16, DataType::Int64}, {DataType::Int16}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int16_t>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int32, DataType::Int64}, {DataType::Int32}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int32_t>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int64, DataType::Int64}, {DataType::Int64}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int64_t>,
+           nullptr});
+
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float16, DataType::Int64}, {DataType::Float16}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<half_float::half>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float32, DataType::Int64}, {DataType::Float32}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<float>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float64, DataType::Int64}, {DataType::Float64}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<double>,
+           nullptr});
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a3ba9add1e98580c51a8416adc0d1feb5e1317a
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
+#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
+
+#include <cstddef> // std::size_t
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Heaviside.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/future_std/span.hpp"
+
+namespace Aidge {
+using HeavisideImplCpu =
+    OperatorImpl_cpu<Heaviside_Op,
+                     void(std::size_t, const void *, void *, const float),
+                     void(const float, std::size_t, const void *, void *)>;
+
+// Implementation entry point registration for operator Heaviside
+REGISTRAR(Heaviside_Op, "cpu", HeavisideImplCpu::create);
+} // namespace Aidge
+
+#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fd6ca7de348ff18e75b2a88281d4db980b58774
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstddef> // std::size_t
+
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+
+namespace Aidge {
+
+template <class I, class O>
+void HeavisideImplCpuForwardKernel(std::size_t inputLenght,
+                                   const void *input_,
+                                   void *output_,
+                                   const float value) {
+    const I *input = static_cast<const I *>(input_);
+    O *output = static_cast<O *>(output_);
+
+    for (std::size_t i = 0; i < inputLenght; ++i) {
+        output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(HeavisideImplCpu,
+          {DataType::Float32},
+          {ProdConso::inPlaceModel,
+           Aidge::HeavisideImplCpuForwardKernel<float, float>,
+           nullptr});
+} // namespace Aidge
+
+#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index c927af9ebd4d658c764cc059df9778c273ba178e..eec5583bb548a3d3343b966c54cfccd1600b8f76 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -34,6 +34,7 @@ using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
         const std::size_t,
         const std::vector<std::size_t>,
         const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
         const void*,
         const void*,
         const void*,
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index 556dd56cd32f28de14a43d20b97deb0083341fee..36acb9199c51e900287ca9b262322aa86287d838 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -149,61 +149,53 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 template <class I1, class I2, class O>
 void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
-                                 const std::size_t input1Length,
-                                 const std::size_t grad0Length,
-                                 const std::vector<std::size_t> input0Dims,
-                                 const std::vector<std::size_t> input1Dims,
-                                 const void* input0_,
-                                 const void* input1_,
-                                 const void* grad_output_,
-                                 void* gradientInput0,
-                                 void* gradientInput1)
+                                  const std::size_t input1Length,
+                                  const std::size_t gradOutputLength,
+                                  const std::vector<std::size_t>& dims0,
+                                  const std::vector<std::size_t>& dims1,
+                                  const std::vector<std::size_t>& outputDims,
+                                  const void* input0_,
+                                  const void* input1_,
+                                  const void* grad_output_,
+                                  void* gradientInput0_,
+                                  void* gradientInput1_)
 {
-    const auto* input0 = static_cast<const I1*>(input0_);
-    const auto* input1 = static_cast<const I1*>(input1_);
-    const auto* grad_output = static_cast<const O*>(grad_output_);
-    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
-    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
-
-
-    if(input0Dims.size() >= input1Dims.size())
-    {
-        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
-
-        for(auto i = 0U; i < input0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
-        }
-
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+    const I1* input0 = static_cast<const I1*>(input0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input0 indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            // If input0 is broadcasted along this dimension (== 1) or both dimensions are 1, index is 0.
+            // idxInput0 represent the multi dim index of input0 contributing
+            // to the output at index i.
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-    } else {
-        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
-
-        for(auto i = 0U; i < input1Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
-
-            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+        // We have to access tensors with a flat index, hence the conversion
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
 
-            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
-        }
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
     }
 }
 
@@ -211,6 +203,9 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
 REGISTRAR(MulImpl_cpu,
     {DataType::Float32},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(MulImpl_cpu,
+    {{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
index 6a22ff4ec9d7beaf05be3b479b43dd3ad69bc74b..6449417baf855620669aba11ebca16d9384c4e7c 100644
--- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -99,30 +99,31 @@ void ResizeImpl_cpu_forward_kernel(
     }
     return;
 }
+
 // Kernels registration to implementation entry point
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int16_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int32_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::Int64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::UInt64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int64_t>,
@@ -130,27 +131,27 @@ REGISTRAR(ResizeImpl_cpu,
 
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<half_float::half>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<float>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<double>,
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff5c4778f530912e8bdf97ffadb2f546789e2c48
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using WeightInterleavedImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavedImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18557f8fb5fcdd31476904d273d4d2d7f37a66b5
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
@@ -0,0 +1,143 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::int8_t, std::uint8_t
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+#include "aidge/data/DataType.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+
+namespace Aidge {
+
+    /**
+     * @brief Compacts 8-bit data into a smaller bit-width representation.
+     *
+     * This function takes an array of 8-bit data and compacts it into smaller chunks
+     * based on the specified bit-width `nb_bits`. Each element in `compactData` will
+     * store multiple packed `nb_bits` segments extracted from `data`.
+     *
+     * @param data The input array of 8-bit values to be compacted.
+     * @param dataSize The size of the input `data` array.
+     * @param compactData The output array storing the compacted data.
+     * @param nb_bits The number of bits to extract from each `data` element (must be between 1 and 4, as asserted below).
+     */
+    template <typename T>
+    void compact_data(const T* data, std::size_t dataSize, T* compactData, std::uint8_t nb_bits) {
+        AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width
+
+        // Mask to extract `nb_bits` from each data element
+        const unsigned int mask = (1U << nb_bits) - 1;
+
+        // Calculate the number of `nb_bits` segments that fit into an 8-bit compacted value
+        const unsigned int nbSlot = 8 / nb_bits;
+
+        // Case nb_bits=3 or 4, then shift is 4
+        // Case nb_bits=2, then shift is 2
+        // Case nb_bits=1, then shift is 1
+        std::uint8_t shift = 8 / nbSlot;
+
+        const unsigned int nbFullCompactbytes = dataSize / nbSlot;
+
+        // Main loop to process data in groups of `nbSlot`
+        for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
+            T compact = 0;
+
+            for (unsigned int j = 0; j < nbSlot; ++j) {
+                compact |= (data[i * nbSlot + j] & mask);    // Apply mask to keep `nb_bits` only
+
+                // Shift only if not on the last slot to make room for the next `nb_bits`
+                if (j < nbSlot - 1) {
+                    compact <<= shift;
+                }
+            }
+            // Store the compacted value in the output array
+            compactData[i] = compact;
+        }
+
+
+        // Handle any remaining data elements (if dataSize is not a multiple of nbSlot). NOTE(review): `compact` below is declared std::int8_t rather than the template type T — confirm this is intended for unsigned instantiations.
+        std::size_t remaining = dataSize % nbSlot;
+        if (remaining != 0) {
+            std::int8_t compact = 0;
+            for (std::size_t j = 0; j < remaining; ++j) {
+                compact |= (data[nbFullCompactbytes*nbSlot + j] & mask);
+
+                if (j < remaining - 1) {
+                    compact <<= shift;
+                }
+            }
+            compact <<= (shift*(nbSlot - remaining));
+            // Store the last compacted value
+            compactData[dataSize / nbSlot] = compact;
+        }
+    }
+
+template <class I, class O, int nb_bits>
+void WeightInterleavedImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
+                            const DimSize_t nb_interleaving,
+                            const DimSize_t output_interleaving,
+                            const void* input_,
+                            void* output_) {
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // Aidge::compact_data(const std::int8_t* data, std::size_t dataSize, std::int8_t* compactData, std::uint8_t nb_bits) {
+    for (std::size_t i=0; i<nb_interleaving; ++i){
+        compact_data(input+(i*input_interleaving), input_interleaving, output+(i*output_interleaving), static_cast<std::uint8_t>(nb_bits));
+    }
+
+}
+
+
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Binary>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
+
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
+
+
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+
+
+}
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/version.h.in b/include/aidge/backend/version.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..4b876f63002972c1f8f1340b70cdecdace911012
--- /dev/null
+++ b/include/aidge/backend/version.h.in
@@ -0,0 +1,11 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+namespace Aidge {
+static constexpr const int PROJECT_VERSION_MAJOR = @PROJECT_VERSION_MAJOR@;
+static constexpr const int PROJECT_VERSION_MINOR = @PROJECT_VERSION_MINOR@;
+static constexpr const int PROJECT_VERSION_PATCH = @PROJECT_VERSION_PATCH@;
+static constexpr const char * PROJECT_VERSION = "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@";
+static constexpr const char * PROJECT_GIT_HASH = "@GIT_COMMIT_HASH@";
+}
+#endif // VERSION_H
diff --git a/include/aidge/utils/sys_info/CpuVersionInfo.hpp b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
index 887ce839e079349d9d64505f7184831ffc4cf1c2..3df70d139bd4ad26e0c88d2ee5192e508bc3f71a 100644
--- a/include/aidge/utils/sys_info/CpuVersionInfo.hpp
+++ b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
@@ -2,17 +2,20 @@
 #define AIDGE_UTILS_SYS_INFO_CPU_VERSION_INFO_H
 
 #include "aidge/utils/Log.hpp"
+#include "aidge/backend/cpu_version.h"
 
 namespace Aidge {
 
-#ifndef PROJECT_VERSION // Normally defined in CMakeLists.txt
-#define PROJECT_VERSION "Unknown version"
-#endif
-#ifndef GIT_COMMIT_HASH
-#define GIT_COMMIT_HASH ""
-#endif
-void showCpuVersion() {
-    Log::info("Aidge backend CPU: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__);
+constexpr inline const char * getBackendCPUProjectVersion(){
+    return PROJECT_VERSION;
+}
+
+constexpr inline const char * getBackendCPUGitHash(){
+    return PROJECT_GIT_HASH;
+}
+
+void showBackendCpuVersion() {
+    Log::info("Aidge backend CPU: {} ({}), {} {}", getBackendCPUProjectVersion(), getBackendCPUGitHash(), __DATE__, __TIME__);
         // Compiler version
     #if defined(__clang__)
     /* Clang/LLVM. ---------------------------------------------- */
diff --git a/project_name.txt b/project_name.txt
new file mode 100644
index 0000000000000000000000000000000000000000..25caafdd8ab794dbcb8c8cdef5097e2143accc6a
--- /dev/null
+++ b/project_name.txt
@@ -0,0 +1 @@
+aidge_backend_cpu
diff --git a/pyproject.toml b/pyproject.toml
index 3c08302d0fcd4e77943d165fab802a22f4dc39cc..39bed4d209581b272a8491fbce6c3f28029fdd57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,20 +4,28 @@ description="CPU implementation of operators of the AIDGE framework"
 dependencies = [
     "numpy",
 ]
-requires-python = ">= 3.7"
+requires-python = ">= 3.8"
 readme = "README.md"
 license = { file = "LICENSE" }
-classifiers = [ 
+classifiers = [
     "Development Status :: 2 - Pre-Alpha",
     "Programming Language :: Python :: 3"
     ]
-dynamic = ["version"] # defined in tool.setuptools_scm
+dynamic = ["version"] # defined by pbr
+
+
+[project.urls]
+Homepage = "https://www.deepgreen.ai/en/platform"
+Documentation = "https://eclipse-aidge.readthedocs.io/en/latest/"
+Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu"
+Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/issues"
+Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/releases"
 
 [build-system]
 requires = [
     "setuptools>=64",
-    "setuptools_scm[toml]==7.1.0",
-    "cmake>=3.18.4.post1"
+    "cmake>=3.18.4.post1",
+    "pbr"
 ]
 build-backend = "setuptools.build_meta"
 
@@ -29,9 +37,6 @@ where = ["."]  # list of folders that contain the packages (["."] by default)
 include = ["aidge_backend_cpu*"]  # package names should match these glob patterns (["*"] by default)
 exclude = ["aidge_backend_cpu.unit_tests*"]  # exclude packages matching these glob patterns (empty by default)
 namespaces = false  # to disable scanning PEP 420 namespaces (true by default)
-# SETUPTOOLS_SCM
-[tool.setuptools_scm]
-write_to = "aidge_backend_cpu/_version.py"
 
 #####################################################
 # CIBUILDWHEEL
diff --git a/python_binding/pybind_cpu.cpp b/python_binding/pybind_cpu.cpp
index d5022e1d469ae4171e796baed6c1aa061dd95765..e576de0849ec6e0739b7cad32e8e8b003c092b7b 100644
--- a/python_binding/pybind_cpu.cpp
+++ b/python_binding/pybind_cpu.cpp
@@ -6,10 +6,10 @@ namespace py = pybind11;
 
 namespace Aidge {
 
-void init_cpu_sys_info(py::module& m);
+void init_CpuVersionInfo(py::module& m);
 
 void init_Aidge(py::module& m){
-    init_cpu_sys_info(m);
+    init_CpuVersionInfo(m);
 }
 
 
diff --git a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
index 573bee3659c65f90935e03c06eff5a2998bb9f5b..7461dd955e8b3e64b086a51664b3d73b6bfaed93 100644
--- a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
+++ b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
@@ -3,7 +3,9 @@
 
 namespace py = pybind11;
 namespace Aidge {
-void init_cpu_sys_info(py::module& m){
-    m.def("show_cpu_version", &showCpuVersion);
+void init_CpuVersionInfo(py::module& m){
+    m.def("show_version", &showBackendCpuVersion);
+    m.def("get_project_version", &getBackendCPUProjectVersion);
+    m.def("get_git_hash", &getBackendCPUGitHash);
 }
 }
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..aa0f227f6688468a5ab93384f7b1670086000035
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+# pbr file
+[metadata]
+version = file: version.txt
diff --git a/setup.py b/setup.py
index 22cbd9732c8b9e1099c3e322032e8377f6d4506b..366a4825da5d8ad369834b2231152a8c0424c9e8 100644
--- a/setup.py
+++ b/setup.py
@@ -11,8 +11,10 @@ from math import ceil
 from setuptools import setup, Extension
 from setuptools.command.build_ext import build_ext
 
+def get_project_name() -> str:
+    return open(pathlib.Path().absolute() / "project_name.txt", "r").read().strip()
 
-PROJECT_NAME = "aidge_backend_cpu"
+PROJECT_NAME = get_project_name()
 
 SETUP_DIR = pathlib.Path(__file__).parent
 
@@ -37,6 +39,7 @@ class AidgePkgBuild(build_ext):
         # This lists the number of processors available on the machine
         # The compilation will use half of them
         max_jobs = str(ceil(multiprocessing.cpu_count() / 2))
+        max_jobs = os.environ.get("AIDGE_NB_PROC", max_jobs)
 
         cwd = pathlib.Path().absolute()
 
@@ -51,14 +54,19 @@ class AidgePkgBuild(build_ext):
         package_prefix = build_lib if not self.editable_mode else SETUP_DIR
         pybind_install_prefix = (package_prefix / PROJECT_NAME).absolute()
 
-        os.chdir(str(build_temp))
-
-        compile_type = os.environ.get("AIDGE_PYTHON_BUILD_TYPE", "Release")
         install_path = (
             os.path.join(sys.prefix, "lib", "libAidge")
             if "AIDGE_INSTALL" not in os.environ
             else os.environ["AIDGE_INSTALL"]
         )
+
+        # Read environment variables for CMake options
+        c_compiler = os.environ.get("AIDGE_C_COMPILER", "gcc")
+        cxx_compiler = os.environ.get("AIDGE_CXX_COMPILER", "g++")
+        build_type = os.environ.get("AIDGE_BUILD_TYPE", "Release")
+        asan = os.environ.get("AIDGE_ASAN", "OFF")
+        cmake_arch = os.environ.get("AIDGE_CMAKE_ARCH", "")
+
         build_gen = os.environ.get("AIDGE_BUILD_GEN", "")
         build_gen_opts = (
             ["-G", build_gen]
@@ -66,27 +74,36 @@ class AidgePkgBuild(build_ext):
             else []
         )
         test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
-        
-        self.spawn(
-            [
-                "cmake",
-                *build_gen_opts,
-                str(cwd),
-                f"-DTEST={test_onoff}",
-                f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
-                f"-DCMAKE_BUILD_TYPE={compile_type}",
-                "-DPYBIND=ON",
-                f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
-                "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
-                "-DCOVERAGE=OFF",
-            ]
-        )
+
+        os.chdir(str(build_temp))
+
+        cmake_cmd = [
+            "cmake",
+            *build_gen_opts,
+            str(cwd),
+            f"-DTEST={test_onoff}",
+            f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
+            f"-DCMAKE_BUILD_TYPE={build_type}",
+            f"-DCMAKE_C_COMPILER={c_compiler}",
+            f"-DCMAKE_CXX_COMPILER={cxx_compiler}",
+            f"-DENABLE_ASAN={asan}",
+            "-DPYBIND=ON",
+            f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
+            "-DCMAKE_EXPORT_COMPILE_COMMANDS=1",
+            "-DCOVERAGE=OFF",
+        ]
+
+        # Append architecture-specific arguments if provided
+        if cmake_arch:
+            cmake_cmd.append(cmake_arch)
+
+        self.spawn(cmake_cmd)
 
         if not self.dry_run:
             self.spawn(
-                ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs]
+                ["cmake", "--build", ".", "--config", build_type, "-j", max_jobs]
             )
-            self.spawn(["cmake", "--install", ".", "--config", compile_type])
+            self.spawn(["cmake", "--install", ".", "--config", build_type])
         os.chdir(str(cwd))
 
 
diff --git a/src/operator/ExpandImpl.cpp b/src/operator/ExpandImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfd4d2d82edc4dfb5bbaec6f5b33bf1c00bf3c75
--- /dev/null
+++ b/src/operator/ExpandImpl.cpp
@@ -0,0 +1,56 @@
+
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ExpandImpl_kernels.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Expand.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <> void ExpandImpl_cpu::forward() {
+    const Expand_Op &op_ = static_cast<const Expand_Op &>(mOp);
+    // Check if input are provided
+    AIDGE_ASSERT(op_.getInput(0),
+                 "{}: missing input 0: {}",
+                 Expand_Op::Type,
+                 Expand_Op::getInputsName()[0]);
+    AIDGE_ASSERT(op_.getInput(1),
+                 "{}: missing input 1: {}",
+                 Expand_Op::Type,
+                 Expand_Op::getInputsName()[1]);
+
+    // Find the correct kernel type
+    const auto impl =
+        Registrar<ExpandImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.getInput(0),
+                 op_.getInput(1),
+                 op_.getOutput(0)->getImpl()->rawPtr(),
+                 op_.getOutput(0)->dims());
+}
+
+template <> void ExpandImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(
+        std::runtime_error,
+        "Backward not yet implemented for Expand_Op on backend cpu");
+}
+
+} // namespace Aidge
diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56ceb9b0b474d416f25d77b533373d4b193532b8
--- /dev/null
+++ b/src/operator/HeavisideImpl.cpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+
+#include <stdexcept>
+
+#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/utils/ErrorHandling.hpp"
+
+template <> void Aidge::HeavisideImplCpu::forward() {
+    const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp);
+    std::shared_ptr<Tensor> input0 = op_.getInput(0);
+    std::shared_ptr<Tensor> output0 = op_.getOutput(0);
+    AIDGE_ASSERT(input0, "missing input #0");
+
+    const auto impl =
+        Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.forward(input0->size(),
+                 getCPUPtr(mOp.getRawInput(0)),
+                 getCPUPtr(mOp.getRawOutput(0)),
+                 op_.value());
+}
+
+template <> void Aidge::HeavisideImplCpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Heaviside backward not implemented yet");
+}
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index 422bdd005f058fc9200cf5f7962bfc8d5877e6e1..a90d521a759f1ce6f4883bdd0bc05d84daa0f668 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -58,6 +58,7 @@ void Aidge::MulImpl_cpu::backward() {
                /* grad0Length  */ out0grad->size(),
                /* input0Dims   */ in0->dims(),
                /* input1Dims   */ in1->dims(),
+               out0grad->dims(),
                getCPUPtr(in0),
                getCPUPtr(in1),
                getCPUPtr(out0grad),
diff --git a/src/operator/WeightInterleavedImpl.cpp b/src/operator/WeightInterleavedImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c9f3a6e8df35616a4f7ffae86cbeacd841f44bf
--- /dev/null
+++ b/src/operator/WeightInterleavedImpl.cpp
@@ -0,0 +1,75 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+
+#include <cstddef>  // std::size_t
+#include <functional>
+#include <memory>
+#include <tuple>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
+
+
+template <>
+void Aidge::WeightInterleavedImpl_cpu::forward()
+{
+    const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp);
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0");
+
+    const auto impl = Registrar<WeightInterleavedImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
+
+    // inputInterleaving is the number of consecutive input elements that will be compacted
+    // Here the interleaving is the last dimension (cf STM32 low bit kernels)
+    std::size_t inputInterleaving = input0.dims().back();
+
+    // The resulting compacted dimension was computed in forwardDims and the output tensor was resized
+    std::size_t outputInterleaving = op_.getOutput(0)->dims().back();
+
+    // nb_interleaving is the number of compacted segments
+    std::size_t nbInterleaving;
+
+    // Determine the number of segment to compact
+    if (input0.dims().size() > 1){
+        nbInterleaving = std::accumulate(
+        input0.dims().cbegin(),
+        std::prev(input0.dims().cend()), // Exclude the last element
+        std::size_t(1),
+        std::multiplies<std::size_t>());
+    } else {
+        // Case when the weight tensor is only one dimension
+        nbInterleaving = 1;
+    }
+
+    impl.forward(inputInterleaving,
+        nbInterleaving,
+        outputInterleaving,
+        input0.getImpl()->rawPtr(),
+        getCPUPtr(mOp.getRawOutput(0)));
+
+
+}
+
+template <>
+void Aidge::WeightInterleavedImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu");
+}
\ No newline at end of file
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 5984524fdc8c596641e505897d16e12de78024cc..6c7af9c376a0a58a361880edad9340e3d845febc 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,12 +1,21 @@
-Include(FetchContent)
+set(CATCH2_MIN_VERSION 3.3.0)
 
-FetchContent_Declare(
-  Catch2
-  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.7.1 # or a later release
-)
+find_package(Catch2 ${CATCH2_MIN_VERSION} QUIET)
 
-FetchContent_MakeAvailable(Catch2)
+if(NOT Catch2_FOUND)
+    message(STATUS "Catch2 not found in system, retrieving from git")
+    Include(FetchContent)
+
+    FetchContent_Declare(
+      Catch2
+      GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+      GIT_TAG        devel # moving development branch — pin a release tag for reproducible builds
+    )
+
+    FetchContent_MakeAvailable(Catch2)
+else()
+    message(STATUS "Found system Catch2 version ${Catch2_VERSION}")
+endif()
 
 file(GLOB_RECURSE src_files "*.cpp")
 
diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp
index fd938f10a947d1520600a1d00022eeb970cd76e6..2bc1e7d4c6f8a7cfbae8807e3021f9c5dd89fff6 100644
--- a/unit_tests/data/Test_TensorImpl.cpp
+++ b/unit_tests/data/Test_TensorImpl.cpp
@@ -9,19 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
-#include "aidge/data/Tensor.hpp"
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/operator/Add.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 namespace Aidge {
 
@@ -35,8 +39,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") {
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
-    std::shared_ptr<Node> mySub = Add();
-    auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bca4025705cb1c851dcf3e9accbf016c4535120a..bff9629be152163b2aa92bdc9d0c3029d7987b9b 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -9,12 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -96,7 +100,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Array4D<int,3,3,3,2> {
             {                                               //
                 {                                           //
                     {{ 120, 222},{ 124, 226},{ 128, 230}},  //
@@ -114,7 +118,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
                     {{ 144, 246},{ 148, 250},{152, 254}}    //
                 }                                           //
             }                                               //
-        });                                                 //
+        };                                                 //
 
         std::shared_ptr<Node> myAdd_0 = Add();
         std::shared_ptr<Node> myAdd_1 = Add();
@@ -131,8 +135,8 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         op_1->setBackend("cpu");
         myAdd_0->forward();
         myAdd_1->forward();
-        op_1->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op_1->getOutput(0) == *expectedOutput);
+        Log::info("Add_1 Tensor:\n{}", *(op_1->getOutput(0)));
+        Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
+        REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 053bb3ea4ed913bd388f3ae049c4d6402ad58d59..c2309dce5f32862ad9aeceaf98430b75ab7be6ef 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -9,13 +9,19 @@
  *
  ********************************************************************************/
 
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <memory>
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
 #include <catch2/catch_test_macros.hpp>
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
 
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/And.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -180,7 +186,7 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
         }                                       //
         });                                     //
 
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
             {                                   //
                 {                               //
diff --git a/unit_tests/operator/Test_ArgMaxImpl.cpp b/unit_tests/operator/Test_ArgMaxImpl.cpp
index 9915d90423e976db1bdd2a694a2cfd7beb380cee..894697f65a6f73af27a568b994c1dd2dc6b118f3 100644
--- a/unit_tests/operator/Test_ArgMaxImpl.cpp
+++ b/unit_tests/operator/Test_ArgMaxImpl.cpp
@@ -9,17 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/ArgMax.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
-#include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -118,8 +121,8 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2") {
 
             Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
-               { 
-                    { 
+               {
+                    {
                         {3.0},
                         {2.0},
                         {1.0}
@@ -144,7 +147,7 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2 with keep_dims false") {
 
             Tensor myOutput = Tensor(Array2D<float,2,3> {
-               { 
+               {
                     { 3.0, 2.0, 1.0 },
                     { 2.0, 1.0, 0.0 }
                }
@@ -196,10 +199,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
             op->associateInput(0,myInput);
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
-            std::cout << " ...............  "<< std::endl;
+            fmt::print("{:.^20}\n", "forward");
             myArgMax->forward();
+            fmt::print("{:.^20}\n", "result");
             op->getOutput(0)->print();
-            std::cout <<"------"<<std::endl;
+            fmt::print("{:.^20}\n", "truth");
             myOutput.print();
 
             REQUIRE(*(op->getOutput(0)) == myOutput);
diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp
index 9548e35d81b0423125424a4198d82558c4e57df4..b9438db0b38642e8c49e46451544a68714ac4de6 100644
--- a/unit_tests/operator/Test_Atan.cpp
+++ b/unit_tests/operator/Test_Atan.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Atan.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -32,7 +36,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
              0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
@@ -61,7 +65,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
                                   {0.75377332, 0.77411225, 0.32928031}}}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index aaa2757830c245275d02792a7a5a2eb1db32d7b8..372febc61d04c2ba983dd33f009fe5bf1d2908a0 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
-#include <cstdlib>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/AvgPooling.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -53,7 +57,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
     });
     SECTION("Stride") {
         std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
             {
@@ -90,7 +94,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         }
         });
         std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mycdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         Tensor myOutput = Array4D<float,1,1,1,1> {
             {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
diff --git a/unit_tests/operator/Test_BatchNormImpl.cpp b/unit_tests/operator/Test_BatchNormImpl.cpp
index 1b42c90dd09d63cd319f19bd29751da816db06c0..26e964f9386e19a6070d75a4106b6b46a29e455d 100644
--- a/unit_tests/operator/Test_BatchNormImpl.cpp
+++ b/unit_tests/operator/Test_BatchNormImpl.cpp
@@ -9,20 +9,24 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/BatchNorm.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] BatchNorm(forward)", "[BatchNorm][CPU]") {
     std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
-    auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator());
+    auto op = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNorm -> getOperator());
     std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}});
     std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}});
     std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}});
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index a52990bc7991a325ce151cf6634b0d5a831992c8..33ab932e296be717604be42716d7abe2b61f65ee 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -9,15 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock — NOTE(review): this added <chrono> duplicates the retained #include <chrono> a few lines below; consider dropping one
 #include <cstddef>   // std::size_t
 #include <cstdint>   // std::uint16_t
 #include <chrono>
-#include <iostream>
 #include <memory>
-#include <numeric>   
+#include <numeric>
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/BitShift.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +34,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<int> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
@@ -131,8 +136,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("Test BitShift kernels with Broadcasting") {
             std::size_t number_of_operation = 0;
@@ -194,7 +199,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                                 }
                                 else
                                 {
-                                    result[idx_out + d] = array0[idx0] >> array1[idx1];                               
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];
                                 }
                             }
                         }
@@ -222,12 +227,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
                 // comparison between truth and computed result
-                bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres));
-                if(equiv == false)
-                {
-                    std::cout << "Problem\n";
-                }
-                REQUIRE(equiv);
+                REQUIRE(approxEq<int>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] array1;
@@ -236,8 +236,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
 }
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
index 45c8da5bf7ecc84fad6b3e694fe204540f579af3..99147ac93bd659dd91897f6b7f1f3f33e5552ef6 100644
--- a/unit_tests/operator/Test_ClipImpl.cpp
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -9,36 +9,37 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <algorithm>  // std::max, std::min
+#include <chrono>
 #include <cstddef>  // std::size_t
 #include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include <iomanip>
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937,
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Clip.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include "aidge/backend/cpu.hpp"
 
 void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) {
     if (vec1.size() != vec2.size()) {
-        std::cerr << "Vectors should have the same sizes." << std::endl;
+        fmt::print(stderr, "Vectors should have the same sizes.\n");
         return;
     }
 
-    for (size_t i = 0; i < vec1.size(); ++i) {
+    for (std::size_t i = 0; i < vec1.size(); ++i) {
         if (vec1[i] < min || vec1[i] > max) {
             vec2[i] = 0.0f;
         }
     }
 }
-namespace Aidge 
+namespace Aidge
 {
 TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
  {
@@ -47,8 +48,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
     std::random_device rd;
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> dis(0.0, 10.0);
-    std::uniform_real_distribution<float> dismin(0.0, 4.5); 
-    std::uniform_real_distribution<float> dismax(5.5, 10.0); 
+    std::uniform_real_distribution<float> dismin(0.0, 4.5);
+    std::uniform_real_distribution<float> dismax(5.5, 10.0);
     std::uniform_int_distribution<std::size_t> distDims(5,15);
     std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5);
 
@@ -71,7 +72,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -80,7 +81,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -109,7 +110,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -118,9 +119,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
+    }
     SECTION("Clip test with min >= max [Forward]") {
         std::size_t totalComputation = 0;
         for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
@@ -131,7 +132,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -140,7 +141,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismax(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -169,7 +170,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -178,13 +179,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
+    }
     SECTION("Clip with Clip Attr [Forward]")
     {
         std::size_t totalComputation = 0;
-        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) 
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
         {
 
             float min = dismin(gen);
@@ -200,7 +201,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
             // Convert Input to Tensor
@@ -231,8 +232,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
     SECTION("Simple clip test [Backward]") {
         std::size_t totalComputation = 0;
@@ -243,13 +244,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             // generate Tensors dimensions
             const std::size_t dim0 = distDims(gen);
             const std::size_t dim1 = distDims(gen);
-  
+
             totalComputation += dim0*dim1;
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
             float* gradArray = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
                 gradArray[i] = dis(gen);
             }
@@ -264,7 +265,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -296,7 +297,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             myClip->forward();
 
             op->getOutput(0)->setGrad(TGrad);
-            
+
             start = std::chrono::system_clock::now();
             REQUIRE_NOTHROW(myClip->backward());
             end = std::chrono::system_clock::now();
@@ -310,9 +311,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
             REQUIRE(GT1 == BackwardTensorVec);
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
  }
-} // namespace Aidge 
+} // namespace Aidge
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 42505d385fde7e72e09531f1607287ffc6978f75..8ec1669b92a5116999413cf55a8c5113363ef330 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -9,32 +9,27 @@
  *
  ********************************************************************************/
 
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstddef> // std::size_t
-#include <cstdint> // std::uint16_t
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int64_t, std::uint16_t
 #include <memory>
-#include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937,
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
 
-#include "catch2/internal/catch_compiler_capabilities.hpp"
-#include "catch2/internal/catch_enforce.hpp"
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators_random.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include <aidge/data/Data.hpp>
-#include <aidge/data/half.hpp>
-#include <aidge/filler/Filler.hpp>
-#include <aidge/operator/OperatorTensor.hpp>
-#include <aidge/operator/Reshape.hpp>
-#include <aidge/utils/TensorUtils.hpp>
-#include <aidge/utils/Types.h>
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
 TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
@@ -62,7 +57,7 @@ TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
     result->setDataType(DataType::Int64);
     result->setBackend("cpu");
     for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<int64_t>(i, input_tensor_values_dist(gen));
+      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
     }
     return result;
   };
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 5d7dfdf12032d4c444e38cda6d2a4298fc552b14..4037b2ad4e117573279f07d0c1819d3435ee7ada 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937,
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Div.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_ErfImpl.cpp b/unit_tests/operator/Test_ErfImpl.cpp
index 2826b5b57d431cf8296a9869f88f7d642c59c963..c2fdd1c8606804e4a9f63051fa66667ae374fb9d 100644
--- a/unit_tests/operator/Test_ErfImpl.cpp
+++ b/unit_tests/operator/Test_ErfImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/ErfImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Erf.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 
 using namespace Aidge;
@@ -27,23 +29,18 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
             {0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199, 0.09514862,
               0.16145366, 0.42776686, 0.43487436, 0.41170865}
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<float,10> {
+        Tensor expectedOutput = Array1D<float,10> {
                 {0.44163144, 0.45801866, 0.81516320, 0.33941913, 0.72722000, 0.10704061,
               0.18061027, 0.45479023, 0.46144873, 0.43959764}
-        });
+        };
 
-        std::shared_ptr<Node> myErf = Erf();
-        auto op = std::static_pointer_cast<OperatorTensor>(myErf -> getOperator());
+        auto op = std::make_shared<Erf_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        myErf->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 
     SECTION("3D Tensor") {
@@ -59,7 +56,7 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<float,2,2,3> {
+        Tensor expectedOutput = Array3D<float,2,2,3> {
             {
                 {
                     {0.83003384, 0.77721894, 0.72857803},
@@ -70,19 +67,14 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
                     {0.81564975, 0.83322692, 0.37109339}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myErf = Erf();
-        auto op = std::static_pointer_cast<OperatorTensor>(myErf -> getOperator());
+        auto op = std::make_shared<Erf_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        myErf->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ExpandImpl.cpp b/unit_tests/operator/Test_ExpandImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..878c608110eabb824d8a6c0d1ceb0853b3c1449d
--- /dev/null
+++ b/unit_tests/operator/Test_ExpandImpl.cpp
@@ -0,0 +1,103 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+#include "aidge/data/DataType.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Expand.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+
+using std::shared_ptr;
+
+using namespace Aidge;
+
+void setupTestExpand(shared_ptr<Tensor> inputData,   // data tensor to expand (becomes input 0)
+                     shared_ptr<Tensor> inputShape,  // target-shape tensor (becomes input 1)
+                     shared_ptr<Expand_Op> &op) {    // operator under test; its inputs are (re)bound here
+
+    op->getOutput(0)->setDataType(inputData->dataType());  // output dtype mirrors the data input's dtype
+
+    inputData->setBackend("cpu");       // move data to the CPU backend before wiring it up
+    op->associateInput(0, inputData);
+
+    inputShape->setBackend("cpu");      // shape tensor also lives on CPU
+    op->associateInput(1, inputShape);
+}
+
+TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
+    std::shared_ptr<Expand_Op> op = std::make_shared<Expand_Op>();
+    op->setBackend("cpu");
+
+    SECTION("Expand shape is bigger than inputData") {
+        auto inputData = std::make_shared<Tensor>(Array1D<int, 2>({1, 3}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 3, 4, 2}));
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
+                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}},
+                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}});
+        setupTestExpand(inputData, inputShape, op);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        REQUIRE(expectedOutput == *op->getOutput(0));
+    }
+    SECTION("Expand shape has less dimensions than inputData") {
+        auto inputData = std::make_shared<Tensor>(
+            Array3D<int, 2, 1, 3>({{{2, 1, 3}, {2, 1, 3}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
+        Tensor expectedOutput = Array3D<cpptype_t<DataType::Int32>, 2, 2, 3>(
+            {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}});
+        setupTestExpand(inputData, inputShape, op);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        REQUIRE(expectedOutput == *op->getOutput(0));
+    }
+    SECTION("Expand shape = {1} leads to input equal to output.") {
+        auto inputData = std::make_shared<Tensor>(
+            Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1}));
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}});
+        setupTestExpand(inputData, inputShape, op);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        REQUIRE(expectedOutput == *op->getOutput(0));
+    }
+    SECTION("The only common dimension is the last one & its equal to 1") {
+        auto inputData = std::make_shared<Tensor>(
+            Array4D<int, 1, 1, 3, 1>({{{{2, 1, 3}}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1}));
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}});
+        setupTestExpand(inputData, inputShape, op);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        REQUIRE(expectedOutput == *op->getOutput(0));
+    }
+    SECTION("N-Dim to N-Dim") {}
+    auto inputData = std::shared_ptr<Tensor>();
+}
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index b2566f26d984fb1d89052745ec35870c6b935d48..8ac0afc33152f4ae110b1c3ef0b4e88f37b00e99 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -9,13 +9,16 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/FC.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -42,11 +45,13 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
               9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
               13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
     std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<int, 2, 5>{
-            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
+    Tensor myOutput = Array2D<int, 2, 5>{
+            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}};
 
     std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
-    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator());
+    op -> setDataType(DataType::Int32);
+    op -> setBackend("cpu");
     op -> associateInput(1, myWeights);
     op -> associateInput(2, myBias);
 
@@ -62,10 +67,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
                   120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
                   135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
         op->associateInput(0, myInput);
-        op -> setDataType(DataType::Int32);
-        op -> setBackend("cpu");
         myFC->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
     SECTION("4D input") {
         std::shared_ptr<Tensor> myInput =
@@ -100,10 +103,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
                                                                      {140, 141, 142, 143, 144},
                                                                      {145, 146, 147, 148, 149}}}}});
         op->associateInput(0, myInput);
-        op -> setDataType(DataType::Int32);
-        op -> setBackend("cpu");
         myFC->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
 
     // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
diff --git a/unit_tests/operator/Test_FoldImpl.cpp b/unit_tests/operator/Test_FoldImpl.cpp
index 6832f5a42d796d9261495794e0758ce1b6df0346..184b9e9acfe2cd5e86f74d304e37ba2aeacc7cf5 100644
--- a/unit_tests/operator/Test_FoldImpl.cpp
+++ b/unit_tests/operator/Test_FoldImpl.cpp
@@ -13,6 +13,7 @@
 #include <cstdlib>
 #include <memory>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
@@ -21,8 +22,6 @@
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/operator/Reshape.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Fold(forward)", "[Fold][CPU]") {
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 43af544871ad6c2ac319de09f3c6fce5065e60d5..8e8536accadcb874f74d4d962aae435bc1351d6e 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -9,34 +9,29 @@
  *
  ********************************************************************************/
 
-#include <aidge/utils/Types.h>
-#include <catch2/catch_test_macros.hpp>
 #include <chrono>
-#include <cmath>
 #include <cstddef> // std::size_t
 #include <cstdint> // std::uint16_t
-#include <iostream>
+#include <functional>  // std::multiplies
 #include <memory>
 #include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-
-// debug print function
-void print_tensor(Aidge::Tensor &T) {
-  // Print tensors
-  std::cout << "Tensor : size =  [";
-  for (auto &dim : T.dims()) {
-    std::cout << dim << " , ";
-  }
-  std::cout << "]" << std::endl;
-  T.print();
-}
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
 TEST_CASE("[cpu/operator] GlobalAveragePooling",
           "[GlobalAveragePooling][CPU]") {
   constexpr std::uint16_t NBTRIALS = 10;
@@ -54,9 +49,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
                                                             std::size_t(7));
 
   // Create MatGlobalAveragePooling Operator
-  std::shared_ptr<Node> globAvgPool = GlobalAveragePooling();
-  auto op =
-      std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator());
+  std::shared_ptr<GlobalAveragePooling_Op> op = std::make_shared<GlobalAveragePooling_Op>();
   op->setDataType(DataType::Float32);
   op->setBackend("cpu");
 
@@ -99,7 +92,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
       T0->resize(dims);
       T0->getImpl()->setRawPtr(array0, nb_elements);
 
-      REQUIRE_THROWS(globAvgPool->forward());
+      REQUIRE_THROWS(op->forward());
       delete[] array0;
     }
 
@@ -158,7 +151,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
         op->forwardDims();
         start = std::chrono::system_clock::now();
-        REQUIRE_NOTHROW(globAvgPool->forward());
+        REQUIRE_NOTHROW(op->forward());
         end = std::chrono::system_clock::now();
         duration +=
             std::chrono::duration_cast<std::chrono::microseconds>(end - start);
@@ -231,7 +224,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -358,7 +351,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -547,7 +540,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -561,12 +554,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           delete[] result;
         }
       }
-      std::cout << "GlobalAveragePooling total execution time : "
-                << duration.count() << "µs" << std::endl;
-      std::cout << "Number of operations : " << number_of_operation
-                << std::endl;
-      std::cout << "Operation / µs = " << number_of_operation / duration.count()
-                << std::endl;
+      Log::info("GlobalAveragePooling total execution time: {}µs\n", duration.count());
+      Log::info("Number of operations : {}\n", number_of_operation);
+      Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
     }
   }
 }
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4cbdf1a0e29f8670e45897236374726dac62bb43
--- /dev/null
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -0,0 +1,104 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
+
+#include <cstddef>    // std::size_t
+#include <cstdlib>
+#include <functional> // std::multiplies
+#include <memory>
+#include <numeric>    // std::accumulate
+#include <random>
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge
+{
+
+TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+
+    SECTION("1D Tensor") {
+
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<float,10> {
+            {0.5, 1, 1, 0, 1, 0, 0, 1, 1, 1}
+        });
+
+        std::shared_ptr<Node> heaviside = Heaviside(0.5);
+        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
+        op->associateInput(0, input0);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Float32);
+
+        op->forward();
+        REQUIRE(approxEq<float>(*op->getOutput(0),*expectedOutput));
+    }
+
+    SECTION("+1-D Tensor")
+    {
+        auto dims = std::vector<std::size_t>();
+        auto nbDims = nbDimsDist(gen);
+
+        for (auto i = 0u; i < nbDims; ++i) {
+            dims.push_back(dimSizeDist(gen));
+        }
+
+        auto numberOfElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        // NOTE(review): sibling tests delete[] the buffers handed to setRawPtr(),
+        // so it is treated as non-owning here; std::vector owns the storage instead.
+        std::vector<float> inputArray(numberOfElements);
+        std::vector<float> resultArray(numberOfElements);
+
+        for(auto i = 0u; i < numberOfElements; ++i)
+        {
+            inputArray[i] = valueDist(gen);
+            resultArray[i] = inputArray[i] > 0 ? 1 : (inputArray[i] == 0 ? 0.5 : 0);
+        }
+
+        auto T0 = std::make_shared<Tensor>();
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+
+        auto T1 = std::make_shared<Tensor>();
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        T0->resize(dims);
+        T0->getImpl()->setRawPtr(inputArray.data(), numberOfElements);
+        T1->resize(dims);
+        T1->getImpl()->setRawPtr(resultArray.data(), numberOfElements);
+
+        std::shared_ptr<Node> heaviside = Heaviside(0.5);
+        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
+        op->associateInput(0, T0);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Float32);
+
+        op->forward();
+
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), *T1));
+    }
+}
+}
diff --git a/unit_tests/operator/Test_LeakyReLUImpl.cpp b/unit_tests/operator/Test_LeakyReLUImpl.cpp
index 85dd9f99ee425216f8495e7813b35ce69be9c806..b60b8bb3e50a33eb339f66905c81d6824e28a835 100644
--- a/unit_tests/operator/Test_LeakyReLUImpl.cpp
+++ b/unit_tests/operator/Test_LeakyReLUImpl.cpp
@@ -9,13 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/LeakyReLU.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] LeakyReLU(forward)", "[LeakyReLU][CPU]") {
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index d6e934b4dc8d84e8a595eb74d1af9d2c68c892d1..f062f06cddfbd04217d63e1edcb6505914bc77e9 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -9,21 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>  // std::size_t
-#include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
-#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
-
 namespace Aidge {
 
 TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
@@ -106,8 +111,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("3-D Tensors") {
@@ -174,8 +179,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("4-D Tensors") {
@@ -244,8 +249,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("+2-D / 1-D") {
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index af04ede4e33c32ce785804e2484b6ba9ac5edc36..de02df2b73bc461bbd76b089cd555d7c82bd173e 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -9,15 +9,17 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <array>
 #include <memory>
-#include <cstdlib>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MaxPooling.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 
@@ -53,10 +55,9 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         }
     });
     SECTION("Stride") {
-        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+        std::shared_ptr<MaxPooling_Op<2>> op = std::make_shared<MaxPooling_Op<2>>(std::array<std::size_t, 2>({2,2}), std::array<std::size_t, 2>({2,2}));
 
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+        Tensor myOutput = Array4D<float,2,2,2,2> {
             {
                 {
                     {{  0.7995,  0.6142},
@@ -71,12 +72,12 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
                      {0.0857,  0.6776}}
                 }
             }
-        });
-        myMaxPool->getOperator()->associateInput(0,myInput);
-        myMaxPool->getOperator()->setDataType(DataType::Float32);
-        myMaxPool->getOperator()->setBackend("cpu");
-        myMaxPool->forward();
+        };
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
         op->getOutput(0)->print();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_Memorize.cpp b/unit_tests/operator/Test_Memorize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c1a617e268dfa376ab154f4085dc63ef3760ea7
--- /dev/null
+++ b/unit_tests/operator/Test_Memorize.cpp
@@ -0,0 +1,67 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <string>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/graph/GraphView.hpp"
+#include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Producer.hpp"
+#include "aidge/recipes/GraphViewHelper.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") {
+    SECTION("Test simple") {
+        // Tiny recurrent graph: an Add node feeds a Memorize node (end step 3)
+        // whose second output loops back into the Add.
+        auto input = Producer({1}, "input");
+        auto init = Producer({1}, "init");
+        auto add = Add("add");
+        auto mem = Memorize(3, "mem");
+
+        // Both producers emit the same scalar tensor holding 1.
+        const auto one = std::make_shared<Tensor>(Array1D<int, 1>{{1}});
+        input->getOperator()->setOutput(0, one);
+        init->getOperator()->setOutput(0, one);
+
+        input->addChild(add, 0, 0);   // input -> add (first operand)
+        init->addChild(mem, 0, 1);    // init seeds the memory
+        add->addChild(mem, 0, 0);     // add refreshes the memory
+        mem->addChild(add, 1, 1);     // memory output loops back into add
+
+        auto graph = getConnectedGraphView(input);
+        graph->setDataType(Aidge::DataType::Int32);
+        graph->setBackend("cpu");
+        graph->forwardDims();
+        graph->save("simple_graph");
+
+        SequentialScheduler scheduler(graph);
+        REQUIRE_NOTHROW(scheduler.forward());
+        scheduler.saveSchedulingDiagram("simple");
+
+        // Value expected in the memory once the scheduler has run.
+        const Tensor expected = Array1D<int, 1>{{4}};
+        const auto memOutput = std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0);
+        memOutput->print();
+        REQUIRE((*memOutput == expected));
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 3378861d0d3d7e74e7867c2765a0b09069fa8caf..2937e94938c671140eeeee87d47d5c48f685203e 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -9,379 +9,336 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
 #include <chrono>
-#include <iostream>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution,
+                   // std::uniform_int_distribution
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Mul.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/Log.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
-    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
-    {
-        std::shared_ptr<Node> myMul = Mul();
-        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
-
-        SECTION("Case 1: 2D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
-                {
-                    {
-                        {1,2,3},{4,5,6}
-                    }
-                }
-            ));
+TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
 
-            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {0.1,0.2,0.3}
-            ));
+    // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions.
 
-            T0->setDataType(DataType::Float32);
-            T0->setBackend("cpu");
-            T1->setDataType(DataType::Float32);
-            T1->setBackend("cpu");
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
 
-            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
 
-            op->associateInput(0,T0);
-            op->associateInput(1,T1);
-            op->forwardDims();
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
 
-            myMul->forward();
-            myMul->backward();
+        op->backward();
 
-            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
-            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}});
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
-        }
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({5, 7, 9});
 
-        SECTION("Case 2: 3D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {7.0, 8.0, 9.0},
-                            {10.0, 11.0, 12.0}
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
-
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                    {
-                        {
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            },
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            }
-                        }
-                    }
-                ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                {
-                    {
-                        {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
-                        },
-                        {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
-                        }
-                    }
-                }
-            ));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {22.0, 26.0, 30.0}
-            ));
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
 
-            for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
 
-            myMul->backward();
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}},
+                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}});
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
-        }
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({22.0, 26.0, 30.0});
 
-        SECTION("Case 3: 4D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
-                {
-                    {
-                        {
-                            {
-                                {1.0, 2.0, 3.0},
-                                {4.0, 5.0, 6.0},
-                                {7.0, 8.0, 9.0}
-                            },
-                            {
-                                {10.0, 11.0, 12.0},
-                                {13.0, 14.0, 15.0},
-                                {16.0, 17.0, 18.0}
-                            }
-                        },
-                        {
-                            {
-                                {19.0, 20.0, 21.0},
-                                {22.0, 23.0, 24.0},
-                                {25.0, 26.0, 27.0}
-                            },
-                            {
-                                {28.0, 29.0, 30.0},
-                                {31.0, 32.0, 33.0},
-                                {34.0, 35.0, 36.0}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
-                {
-                    {
-                        {0.5,0.3,0.1},
-                        {0.4,0.2,0.6},
-                        {0.7,0.8,0.9}
-                    }
-                }
-            ));
-
-            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
-                {
-                    {
-                        {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
-                        },
-                        {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
-                {
-                    {
-                        {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
-                        },
-                        {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
-                {
-                    {
-                        {58.0, 62.0, 66.0},
-                        {70.0, 74.0, 78.0},
-                        {82.0, 86.0, 90.0}
-                    }
-                }
-            ));
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
 
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}},
+                  {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{58.0, 62.0, 66.0},
+                                   {70.0, 74.0, 78.0},
+                                   {82.0, 86.0, 90.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}},
+                                      {{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
+                                   {22.0, 24.0, 26.0, 28.0},
+                                   {30.0, 32.0, 34.0, 36.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-            myMul->backward();
+    SECTION("Case 5: Tensors with random values") {
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        // Fill both input tensors with random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
         }
 
-        SECTION("Case 4: 3D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0, 4.0},
-                            {5.0, 6.0, 7.0, 8.0},
-                            {9.0, 10.0, 11.0, 12.0},
-                        },
-                        {
-                            {13.0, 14.0, 15.0, 16.0},
-                            {17.0, 18.0, 19.0, 20.0},
-                            {21.0, 22.0, 23.0, 24.0},
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
-                {
-                    {
-                        {0.1, 0.2, 0.3, 0.4},
-                        {0.5, 0.6, 0.7, 0.8},
-                        {0.9, 1.0, 1.1, 1.2}
-                    }
-                }
-            ));
-
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
-                {
-                    {
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        },
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
-                {
-                    {
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        },
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        }
-                    }
-                }
-            ));
-
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
-                {
-                    {
-                        {14.0, 16.0, 18.0, 20.0},
-                        {22.0, 24.0, 26.0, 28.0},
-                        {30.0, 32.0, 34.0, 36.0}
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] * input1Data[in1Idx];
                     }
                 }
-            ));
-
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
-            {
-                T->setBackend("cpu") ;
-                T->setDataType(DataType::Float32);
             }
+        }
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        auto outputTensor = op->getOutput(0);
 
-            myMul->backward();
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
         }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output * input1
+                        expectedGrad0[in0Idx] +=
+                            gradOutputData[outIdx] * input1Data[in1Idx];
+
+                        // Gradient for input1: grad_output * input0
+                        expectedGrad1[in1Idx] +=
+                            gradOutputData[outIdx] * input0Data[in0Idx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+
+        // Optional: Print some values for verification
+        // std::cout << "Input shapes: (" << dims0[0] << "," << dims0[1] <<
+        // "," << dims0[2] << "," << dims0[3]
+        //           << ") * (" << dims1[0] << "," << dims1[1] << "," <<
+        //           dims1[2]
+        //           << ") -> (" << outputDims[0] << "," << outputDims[1]
+        //           << "," << outputDims[2] << "," << outputDims[3] <<
+        //           ")\n";
+        // std::cout << "Input sizes: " << input0_size << " * " <<
+        // input1_size << " -> " << output_size << "\n";
     }
+}
 
-TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
+TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
-    std::uniform_int_distribution<int> boolDist(0,1);
-
-    // Create MatMul Operator
-    std::shared_ptr<Node> myMul = Mul();
-    auto op = std::static_pointer_cast<OperatorTensor>(myMul-> getOperator());
+    std::uniform_real_distribution<float> valueDist(
+        0.1f,
+        1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
+                                                           std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1),
+                                                          std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0, 1);
+
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-    // Create 2 input Tensors
     std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-    op->associateInput(0,T0);
+    op->associateInput(0, T0);
     T0->setDataType(DataType::Float32);
     T0->setBackend("cpu");
     std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
-    op -> associateInput(1,T1);
+    op->associateInput(1, T1);
     T1->setDataType(DataType::Float32);
     T1->setBackend("cpu");
 
-    // Create results Tensor
     std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
     Tres->setDataType(DataType::Float32);
     Tres->setBackend("cpu");
@@ -391,14 +348,9 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::chrono::time_point<std::chrono::system_clock> end;
     std::chrono::duration<double, std::micro> duration{};
 
-
     SECTION("MulImpl_cpu::forward()") {
-        SECTION("Scalar / Scalar") {
-
-        }
-        SECTION("Scalar / +1-D Tensor") {
-
-        }
+        SECTION("Scalar / Scalar") {}
+        SECTION("Scalar / +1-D Tensor") {}
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
 
             std::size_t number_of_operation = 0;
@@ -413,13 +365,17 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                     dims.push_back(dimSizeDist(gen));
                 }
 
-                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const auto nb_elements =
+                    std::accumulate(dims.cbegin(),
+                                    dims.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
-                float* array0 = new float[nb_elements];
-                float* array1 = new float[nb_elements];
-                float* result = new float[nb_elements];
+                float *array0 = new float[nb_elements];
+                float *array1 = new float[nb_elements];
+                float *result = new float[nb_elements];
 
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
@@ -429,21 +385,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // input0
                 T0->resize(dims);
-                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+                T0->getImpl()->setRawPtr(array0, nb_elements);
 
                 // input1
                 T1->resize(dims);
-                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+                T1->getImpl()->setRawPtr(array1, nb_elements);
 
                 // results
                 Tres->resize(dims);
-                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+                Tres->getImpl()->setRawPtr(result, nb_elements);
 
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
@@ -451,24 +409,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
-
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
 
                 // generate 2 random Tensors
-                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                // handle dimensions, replace some dimensions with '1' to get
+                // broadcasting
 
                 constexpr std::size_t nbDims = 4;
                 std::vector<std::size_t> dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dimensions.push_back(dimSizeDist(gen));
                 }
 
@@ -476,77 +433,90 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 auto dims1 = dimensions;
                 auto dimsOut = dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
-                    if (boolDist(gen))
-                    {
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
                         dims0[i] = 1;
                     }
 
-                    if (boolDist(gen))
-                    {
+                    if (boolDist(gen)) {
                         dims1[i] = 1;
                     }
 
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
-                for(auto dim : dims0)
-                {
+                for (auto dim : dims0) {
                     Log::info("Dimension of input 0 : {}", dim);
                 }
 
-                for(auto dim : dims1)
-                {
+                for (auto dim : dims1) {
                     Log::info("Dimension of input 1 : {}", dim);
                 }
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
-                {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                float *array1 =
+                    new float[dims1[0] * dims1[1] * dims1[2] * dims1[3]];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < dims0[0] * dims0[1] * dims0[2] * dims0[3];
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
 
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
-                {
+                for (std::size_t i = 0;
+                     i < dims1[0] * dims1[1] * dims1[2] * dims1[3];
+                     ++i) {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-
-                for (std::size_t a = 0; a < dimsOut[0]; ++a)
-                {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
-                    {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-
-                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
-
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
-                        {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
-                            {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
-                                                    + ((dims1[3] > 1) ? d : 0);
-
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1[1] * dims1[2] * dims1[3],
+                    dims1[2] * dims1[3],
+                    dims1[3],
+                    1};
+
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1[1] > 1) ? b : 0);
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] * ((dims1[2] > 1) ? c : 0) +
+                                    ((dims1[3] > 1) ? d : 0);
+
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1 <<
+                                // ") -> " << array0[idx0] << " * " <<
+                                // array1[idx1] << " -> " << idx_out + d <<
+                                // std::endl;
                             }
                         }
                     }
@@ -555,22 +525,30 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+                T1->getImpl()->setRawPtr(
+                    array1,
+                    dims1[0] * dims1[1] * dims1[2] * dims1[3]);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -579,15 +557,21 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
-            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(
+                std::size_t(1),
+                std::size_t(3));
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 // generate 2 random Tensors
@@ -604,15 +588,24 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                         dims1[i] = 1;
                     }
                 }
-                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+                dims1.erase(dims1.cbegin(),
+                            dims1.cbegin() + nbRemovedDimsDist(gen));
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                float* array1 = new float[array1_size];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                std::size_t array1_size =
+                    std::accumulate(dims1.cbegin(),
+                                    dims1.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
+                float *array1 = new float[array1_size];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < (dims0[0] * dims0[1] * dims0[2] * dims0[3]);
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
                 for (std::size_t i = 0; i < array1_size; ++i) {
@@ -621,27 +614,48 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // compute true result
                 auto dims1_tmp = dims1;
-                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
-
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
+                dims1_tmp.insert(dims1_tmp.cbegin(),
+                                 4 - dims1_tmp.size(),
+                                 std::size_t(1));
+
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1_tmp[1] * dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[3],
+                    1};
                 for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                     for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
                         for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
                             for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
-                                                    + ((dims1_tmp[3] > 1) ? d : 0);
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] *
+                                        ((dims1_tmp[2] > 1) ? c : 0) +
+                                    ((dims1_tmp[3] > 1) ? d : 0);
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1 <<
+                                // ") -> " << array0[idx0] << " * " <<
+                                // array1[idx1] << " -> " << idx_out + d <<
+                                // std::endl;
                             }
                         }
                     }
@@ -650,22 +664,28 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, array1_size);
+                T1->getImpl()->setRawPtr(array1, array1_size);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -674,13 +694,18 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
 } // namespace Aidge
+
diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp
index cdd3a5f979085f3782776ce69ddd92c0d53150c4..f7823d022c8d3b228740a3df3f1d01224cd346c6 100644
--- a/unit_tests/operator/Test_PadImpl.cpp
+++ b/unit_tests/operator/Test_PadImpl.cpp
@@ -9,15 +9,17 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstdlib>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PadImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Pad.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
diff --git a/unit_tests/operator/Test_PaddedConv.cpp b/unit_tests/operator/Test_PaddedConv.cpp
index b7584ad069336a270ed07c32d4c07552888b6587..4b76fe0638033e41adf95d3abfd10691deefe940 100644
--- a/unit_tests/operator/Test_PaddedConv.cpp
+++ b/unit_tests/operator/Test_PaddedConv.cpp
@@ -9,16 +9,16 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstdlib>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
-#include "aidge/operator/MetaOperator.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/MetaOperatorDefs.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
 
 using namespace Aidge;
 
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index cb5d8872c9c7242bb4aa4efca388d53b578417f9..55a416c3f404506359e06f9937dd958503236901 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -9,18 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cmath>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Pow.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -118,8 +126,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -213,8 +221,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -309,8 +317,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 
@@ -440,7 +448,7 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                     }
                 }
             ));
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
                 {
                     {
                         {
@@ -453,18 +461,13 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                         }
                     }
                 }
-            ));
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+            );
+            const Tensor expectedGrad1 = Array1D<float, 3>(
                 {
                     {14.14779854, 22.99299049, 33.56402588}
                 }
-            ));
+            );
 
-            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
             std::shared_ptr<Node> powOp = Pow();
             auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
             opr->setDataType(DataType::Float32);
@@ -475,8 +478,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             powOp->forward();
 
             powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
         }
     }
 }
diff --git a/unit_tests/operator/Test_ReLUImpl.cpp b/unit_tests/operator/Test_ReLUImpl.cpp
index 106d29ecfbf8ba785b4f9e5dba75daa272a86b26..eebdf7ac6c716db987c2600f098dcf9331d6a6c9 100644
--- a/unit_tests/operator/Test_ReLUImpl.cpp
+++ b/unit_tests/operator/Test_ReLUImpl.cpp
@@ -9,15 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ReLU.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
-
 
 using namespace Aidge;
 
@@ -26,17 +27,16 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
         std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<int,10> {
             {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int,10> {
+        Tensor expectedOutput = Array1D<int,10> {
             {0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        op->forward();
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("2D Tensor") {
@@ -46,20 +46,19 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<int,2,10> {
+        Tensor expectedOutput = Array2D<int,2,10> {
             {
                 { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
                 { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        op->forward();
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 
     SECTION("3D Tensor") {
@@ -75,7 +74,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<int,2,2,10> {
+        Tensor expectedOutput = Array3D<int,2,2,10> {
             {
                 {
                     { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
@@ -86,15 +85,14 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                     { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        op->forward();
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("4D Tensor") {
@@ -122,7 +120,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
+        Tensor expectedOutput = Array4D<int,2,2,2,10> {
             {
                 {
                     {
@@ -145,14 +143,13 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                     }
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        op->forward();
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp
index dd647c7ba3f90fe7f3554aae7133e97ffa9c99ba..30ffeb0dd0b584f50349c206863c7ab9ac776721 100644
--- a/unit_tests/operator/Test_ReduceMeanImpl.cpp
+++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp
@@ -9,16 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <algorithm>   // std::fill
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ReduceMean.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
diff --git a/unit_tests/operator/Test_ReduceSumImpl.cpp b/unit_tests/operator/Test_ReduceSumImpl.cpp
index 49569d1f65ff6c51f9681632b16375605ab326e7..0aa543da4f9d55dfa672790a5d99467cd794001a 100644
--- a/unit_tests/operator/Test_ReduceSumImpl.cpp
+++ b/unit_tests/operator/Test_ReduceSumImpl.cpp
@@ -9,17 +9,22 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::uint16_t, std::int32_t
 #include <memory>
-#include <numeric>   // std::accumulate
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/data/Data.hpp"  // DataType
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/operator/ReduceSum.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
 #include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/Types.h"
 
 using namespace Aidge;
 
@@ -112,7 +117,7 @@ TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
                 std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                 myInput->setBackend("cpu");
                 myInput->setDataType(DataType::Float32);
-                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true);
+                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<std::int32_t>{}, false, true);
                 auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
                 op->associateInput(0,myInput);
                 op->setDataType(DataType::Float32);
diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp
index b4cf9ffbedc18b35b42ebbc05971f86e0fa584e3..e658b0616683633ce19b2284abb9d4fae7942a23 100644
--- a/unit_tests/operator/Test_RoundImpl.cpp
+++ b/unit_tests/operator/Test_RoundImpl.cpp
@@ -9,15 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Round.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +37,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(-15, 15); 
+    std::uniform_real_distribution<float> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
 
@@ -59,7 +67,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                
+
                 // generate 2 random Tensors
                 const std::size_t nbDims = nbDimsDist(gen);
                 std::vector<std::size_t> dims;
@@ -72,7 +80,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // without broadcasting
                 float* array0 = new float[nb_elements];
                 float* result = new float[nb_elements];
-                
+
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
                     result[i] = std::nearbyint(array0[i]);
@@ -86,29 +94,22 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // results
                 Tres->resize(dims);
                 Tres -> getImpl() -> setRawPtr(result, nb_elements);
-                
+
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
                 myRound->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
-                bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres);
-
-                auto Output = *(op->getOutput(0));
-                
-                auto prt = Output.getImpl()->rawPtr();
-
-                REQUIRE(is_eq);
-                
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] result;
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_SoftmaxImpl.cpp b/unit_tests/operator/Test_SoftmaxImpl.cpp
index da6c6f0d35a1db9ad9099a40b7e83459e14a20f5..bc452a409fef0236b3021de5b41eb47453f42f75 100644
--- a/unit_tests/operator/Test_SoftmaxImpl.cpp
+++ b/unit_tests/operator/Test_SoftmaxImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Softmax.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -30,28 +32,22 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                     0.35077620, -0.78156322, -0.98952234,  0.04166317,  1.34357309}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
+        Tensor expectedOutput = Array2D<float,2,10> {
             {
                 {0.04883239, 0.11326669, 0.05974559, 0.09930880, 0.09267281, 0.03006749,
                     0.15842478, 0.24514021, 0.07825989, 0.07428131},
                 {0.05429055, 0.27136859, 0.28389078, 0.02240700, 0.06262558, 0.06087753,
                     0.01961952, 0.01593576, 0.04469007, 0.16429459}
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySoftmax = Softmax(1);
-        auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
         op->associateInput(0,input);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        mySoftmax->forward();
-
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        op->forward();
 
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
     SECTION("4D Tensor") {
         std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
@@ -80,7 +76,7 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
+        Tensor expectedOutput = Array4D<float,2,3,3,3> {
             {
                 {
                     {{0.45109013, 0.42849392, 0.43775153},
@@ -105,19 +101,14 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                      {0.34566763, 0.32462072, 0.48979440}}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySoftmax = Softmax(1);
-        auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
         op->associateInput(0,input);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        mySoftmax->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_SqrtImpl.cpp b/unit_tests/operator/Test_SqrtImpl.cpp
index d630c66c8b8085e6d382841da6b7cac2c88b1dd0..aac5046003611d96d5c8111192e9fd4c5254b89d 100644
--- a/unit_tests/operator/Test_SqrtImpl.cpp
+++ b/unit_tests/operator/Test_SqrtImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Sqrt.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -28,26 +30,20 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
                 { 0.00000000,  1.84539008}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+        Tensor expectedOutput = Array2D<float,2,2> {
             {
                 {4.00000000, 0.78883994},
                 {0.00000000, 1.35845140}
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySqrt = Sqrt();
-        auto op = std::static_pointer_cast<OperatorTensor>(mySqrt -> getOperator());
-        mySqrt->getOperator()->associateInput(0,input);
-        mySqrt->getOperator()->setDataType(DataType::Float32);
-        mySqrt->getOperator()->setBackend("cpu");
-        mySqrt->forward();
-
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 4; ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        std::shared_ptr<Sqrt_Op> op = std::make_shared<Sqrt_Op>();
+        op->associateInput(0,input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
 
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 
     SECTION("4D Tensor") {
@@ -78,7 +74,7 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
             }
         });
 
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
+        Tensor expectedOutput = Array4D<float,2,3,3,3> {
             {
                 {
                     {{0.24936883, 0.6844717,  0.7804763},
@@ -103,19 +99,14 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
                      {0.3608653,  0.8571328,  0.16447252}}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySqrt = Sqrt();
-        auto op = std::static_pointer_cast<OperatorTensor>(mySqrt -> getOperator());
-        mySqrt->getOperator()->associateInput(0,input);
-        mySqrt->getOperator()->setDataType(DataType::Float32);
-        mySqrt->getOperator()->setBackend("cpu");
-        mySqrt->forward();
+        std::shared_ptr<Sqrt_Op> op = std::make_shared<Sqrt_Op>();
+        op->associateInput(0,input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 54; ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 44666ae631152c8898e24f7003b0c2ede8c67b84..1317e88a371e9a6e7a3deae5b7f662a9cd879a60 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Sub.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c95c8fca19eb79eb78fc19e93ded3383054383e7
--- /dev/null
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -0,0 +1,436 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
+
+    std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
+    auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
+
+    SECTION("CompactDataSize - Single element cases") {
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1);  // 1 bit, needs 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1);  // 7 bits, needs 1 byte
+    }
+
+    SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1);  // 8 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2);  // 8 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4);  // 8 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4);  // 8 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Larger dataSize values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2);  // 16 elements at 1 bit each, fits in 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4);  // 16 elements at 2 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8);  // 16 elements at 3 bits each (2 per byte), needs 8 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8);  // 16 elements at 4 bits each, needs 8 bytes
+    }
+
+    SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1);  // 7 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2);  // 7 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4);  // 7 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4);  // 7 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Minimum and maximum values for nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1);  // 5 elements at 1 bit each, fits in 1 byte
+    }
+
+    SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") {
+        REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0);  // No data elements
+    }
+
+
+    SECTION("CompactData - 4-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x0F),
+                                                                static_cast<std::int8_t>(0xF5),
+                                                                static_cast<std::int8_t>(0xB3),
+                                                                static_cast<std::int8_t>(0x9C)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5),
+                                                                static_cast<int8_t>(0x3C)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 3-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
+                                                                static_cast<int8_t>(0x04),
+                                                                static_cast<int8_t>(0xD3)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75),
+                                                                static_cast<int8_t>(0x43)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 2-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x03),
+                                                                 static_cast<std::int8_t>(0x02),
+                                                                 static_cast<std::int8_t>(0x01),
+                                                                 static_cast<std::int8_t>(0x00)}
+                                                                 });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int2);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xE4)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Single element data") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0x0F)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xF0)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0xA5),
+                                                                static_cast<int8_t>(0x34)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5),
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
+
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
+                                                                static_cast<int8_t>(0x04)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75),
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("Forward Op - Convolution weight interleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("Recipie ApplyWeightInterleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Dual_Int4);
+
+        // Create convolution node
+        std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
+
+        // Place the weight tensor in the weight producer of the conv
+        auto weightProducer = conv->getParent(1);
+        weightProducer->getOperator()->setOutput(0, weight);
+
+        // Set dataType, dataformat and backend of convolution
+        conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
+        conv->getOperator()->setDataType(Aidge::DataType::Int4);
+        conv->getOperator()->setBackend("cpu");
+
+        // Apply recipe
+        applyWeightInterleaving(conv);
+
+        // Compare the weight producer output tensor with the expected weights with interleaving
+        auto newProdOp = std::static_pointer_cast<OperatorTensor>(conv->getParent(1)->getOperator());
+        REQUIRE(*(newProdOp->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+}
diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp
index 5ca2cd9de4dcc9dab2c78f7ae1e1bf3090db8f2b..b78e864fecab1fd103a2cb30924d10a25f5b8f10 100644
--- a/unit_tests/scheduler/Test_CastMove.cpp
+++ b/unit_tests/scheduler/Test_CastMove.cpp
@@ -13,15 +13,20 @@
 #include <memory>
 #include <string>
 
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/data/Tensor.hpp"
-#include "aidge/utils/TensorUtils.hpp"
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/operator/FC.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/recipes/Recipes.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -205,15 +210,15 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
         REQUIRE_NOTHROW(scheduler.forward());
         scheduler.saveSchedulingDiagram("schedulingSequential");
 
-        std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
+        Tensor expectedOutput1 = Array4D<int, 2, 3, 3, 3>{
                 {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
                   {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
                   {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
                  {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
                   {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
-                  {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
+                  {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}};
 
-        std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
+        Tensor expectedOutput2 = Array4D<int, 2, 4, 3, 3>{
                 {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
                   {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
                   {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
@@ -221,26 +226,26 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                  {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
                   {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
                   {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
-                  {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
+                  {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}};
 
-        std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
+        Tensor expectedOutput3 = Array4D<int, 2, 3, 3, 3>{
                 {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
                   {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
                   {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
                  {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
                   {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
-                  {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
+                  {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}};
 
         Tensor expectedOutput4 = Array2D<int, 2, 5>{
                 {{205050376, 198925904, 181355097, 196978090, 238868348},
                 {598467376, 561797804, 560823897, 593043790, 698672948}}};
-        std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other1 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv1")->getOperator())->getOutput(0);
+        REQUIRE(approxEq<float, int>(*other1, expectedOutput1, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other2 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv2")->getOperator())->getOutput(0);
+        REQUIRE(*other2 == expectedOutput2);
+        std::shared_ptr<Tensor> other3 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv3")->getOperator())->getOutput(0);
+        REQUIRE(approxEq<double, int>(*other3, expectedOutput3, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other4 = std::static_pointer_cast<FC_Op>(g->getNode("fc")->getOperator())->getOutput(0);
+        REQUIRE(*other4 == expectedOutput4);
     }
 }
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 78a10c308a60f026b83ea64cfbd25a848099eb90..956169c387c4a34f500f66b214dcf95a145feafd 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -18,10 +18,19 @@
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
 #include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Pop.hpp"
+#include "aidge/operator/Stack.hpp"
+#include "aidge/operator/Identity.hpp"
+#include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
 
-#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+
 #include "aidge/recipes/GraphViewHelper.hpp"
 
 
@@ -438,4 +447,69 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     predictedOutput->setGrad(targetOutput);
     REQUIRE_NOTHROW(scheduler.backward());
 }
+
+std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
+    auto input = Identity((!name.empty()) ? name + "_input" : "");
+    auto hiddenState = Memorize(seqLength, (!name.empty()) ? name + "_hidden_state" : "");
+    auto add = Add((!name.empty()) ? name + "_add" : "");
+
+    input->addChild(add, 0, 0);
+    add->addChild(hiddenState, 0,0);
+    hiddenState->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
+
+    std::shared_ptr<GraphView> microGraph = std::make_shared<GraphView>();
+    microGraph->add(input);
+    microGraph->add({hiddenState, add});
+    microGraph->setOrderedInputs({{input, 0}, {hiddenState, 1}});
+    microGraph->setOrderedOutputs({{hiddenState, 0}});
+
+    auto metaOp = MetaOperator("Accumulate", microGraph, {}, name);
+    return metaOp;
+}
+
+TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
+    std::shared_ptr<Tensor> Input = std::make_shared<Tensor>(
+        Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                 {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+
+    std::shared_ptr<Tensor> MemInit =
+        std::make_shared<Tensor>(Array2D<float, 3, 2>{
+            {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+
+    auto meta = Accumulate(2, "accumulate");
+    auto op = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator());
+    auto pop_i = Pop("pop_input");
+    auto pop_o = Identity("pop_output"); // NOTE: Could be Identity/Stack/Whatever node you want, this is not the problem here
+
+    pop_i->getOperator()->associateInput(0, Input);
+    pop_i->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
+    op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(pop_o, 0, 0);
+
+    //pop_i->addChild(meta, 0, 0);
+    //meta->addChild(pop_o, 0, 0);
+
+    //op->associateInput(1, MemInit);
+    op->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, MemInit);
+
+    // Build the graph.
+    auto myGraph = std::make_shared<GraphView>();
+    myGraph->add(pop_i);
+    myGraph->add(op->getMicroGraph());
+    //myGraph->add(meta);
+    myGraph->add(pop_o);
+    myGraph->compile("cpu", DataType::Float32);
+
+    myGraph->save("accumulate_graph", true);
+
+    // Schedule and run
+    auto scheduler = SequentialScheduler(myGraph);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("accumulate_scheduling");
+    REQUIRE_NOTHROW(scheduler.forward(true));
+
+    std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
+    std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
+    REQUIRE(*output == *expectedOutput);
+}
 } // namespace Aidge
diff --git a/version.txt b/version.txt
index 267577d47e497a0630bc454b3f74c4fd9a10ced4..8f0916f768f0487bcf8d33827ce2c8dcecb645c1 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.1
+0.5.0