From 43f93816aaf6f402213282e056f6a705ea70597a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 18 Dec 2024 12:09:49 +0100
Subject: [PATCH 01/30] Register Mul for mixed Float32, Float64 inputs

---
 include/aidge/backend/cpu/operator/MulImpl_kernels.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index 556dd56c..e3d17a4b 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -211,6 +211,9 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
 REGISTRAR(MulImpl_cpu,
     {DataType::Float32},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(MulImpl_cpu,
+    {{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
-- 
GitLab

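For context, a minimal standalone sketch of what a mixed-input element-wise multiply kernel of this shape can look like, assuming flat inputs of identical length (the registered Aidge kernel additionally takes per-input lengths and handles broadcasting; mul_forward_sketch is an illustrative name, not the Aidge API):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // I1/I2 are the two input element types (e.g. float and double), O the output type.
    template <class I1, class I2, class O>
    void mul_forward_sketch(std::size_t length,
                            const void* input0_, const void* input1_, void* output_) {
        const I1* input0 = static_cast<const I1*>(input0_);
        const I2* input1 = static_cast<const I2*>(input1_);
        O* output = static_cast<O*>(output_);
        for (std::size_t i = 0; i < length; ++i) {
            // Compute the product in double, then narrow to the output type.
            output[i] = static_cast<O>(static_cast<double>(input0[i]) * static_cast<double>(input1[i]));
        }
    }

    int main() {
        std::vector<float>  a{1.5f, 2.0f, -3.0f};
        std::vector<double> b{2.0, 0.5, 1.0};
        std::vector<float>  out(a.size());
        mul_forward_sketch<float, double, float>(a.size(), a.data(), b.data(), out.data());
        for (float v : out) std::cout << v << ' ';   // prints: 3 1 -3
        std::cout << '\n';
        return 0;
    }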

From 7e5f09c456ade60f603136521945eb0d64c1b0c5 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 18 Dec 2024 15:06:43 +0100
Subject: [PATCH 02/30] Hotfix for more generic registry

---
 .../cpu/operator/ResizeImpl_kernels.hpp       | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
index 6a22ff4e..6449417b 100644
--- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -99,30 +99,31 @@ void ResizeImpl_cpu_forward_kernel(
     }
     return;
 }
+
 // Kernels registration to implementation entry point
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int16_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Int32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int32_t>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Int64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::Int64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::UInt64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<int64_t>,
@@ -130,27 +131,27 @@ REGISTRAR(ResizeImpl_cpu,
 
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float16},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float16}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<half_float::half>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float32},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float32}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<float>,
            nullptr});
 REGISTRAR(ResizeImpl_cpu,
           {{{DataType::Float64},
-            {DataType::Float32},
-            {DataType::Float32},
-            {DataType::UInt64}},
+            {DataType::Any},
+            {DataType::Any},
+            {DataType::Any}},
            {DataType::Float64}},
           {ProdConso::inPlaceModel,
            ResizeImpl_cpu_forward_kernel<double>,
-- 
GitLab

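The intent of this hotfix is that DataType::Any on the auxiliary inputs (ROI, scales, sizes) acts as a wildcard during kernel selection, so only the data input and output types need to match. A toy illustration of that wildcard semantics under a simplified matching rule (this is not the Aidge Registrar matching code; all names below are illustrative):

    #include <array>
    #include <cstddef>
    #include <iostream>

    enum class DataType { Any, Int16, Int32, Int64, UInt64, Float16, Float32, Float64 };

    // Toy rule: DataType::Any in a registered spec accepts any concrete requested type.
    bool matches(DataType registered, DataType requested) {
        return registered == DataType::Any || registered == requested;
    }

    int main() {
        // Registered input spec after this patch: {Float32, Any, Any, Any}.
        std::array<DataType, 4> spec{DataType::Float32, DataType::Any, DataType::Any, DataType::Any};
        // A request whose ROI/scales/sizes inputs happen to be Float64/Float32/Int64.
        std::array<DataType, 4> request{DataType::Float32, DataType::Float64, DataType::Float32, DataType::Int64};
        bool ok = true;
        for (std::size_t i = 0; i < spec.size(); ++i) {
            ok = ok && matches(spec[i], request[i]);
        }
        std::cout << (ok ? "kernel selected" : "no match") << '\n';   // prints: kernel selected
        return 0;
    }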

From 824df143b0715a06d2e9f6eb794900d9e0839b11 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Mon, 9 Dec 2024 16:40:30 +0000
Subject: [PATCH 03/30] Update backend_cpu with
 https://gitlab.eclipse.org/eclipse/aidge/aidge_core/-/merge_requests/277

---
 .gitignore                                    |  1 +
 CMakeLists.txt                                | 22 ++++++++++++++-----
 aidge_backend_cpu/__init__.py                 |  1 -
 include/aidge/backend/cpu.hpp                 |  2 ++
 include/aidge/backend/version.h.in            | 11 ++++++++++
 .../aidge/utils/sys_info/CpuVersionInfo.hpp   | 19 +++++++++-------
 project_name.txt                              |  1 +
 pyproject.toml                                | 19 ++++++++++------
 python_binding/pybind_cpu.cpp                 |  4 ++--
 .../utils/sys_info/pybind_CpuVersionInfo.cpp  |  6 +++--
 setup.cfg                                     |  3 +++
 setup.py                                      |  6 +++--
 12 files changed, 68 insertions(+), 27 deletions(-)
 create mode 100644 include/aidge/backend/version.h.in
 create mode 100644 project_name.txt
 create mode 100644 setup.cfg

diff --git a/.gitignore b/.gitignore
index 0e14676b..9877699f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 # C++ Build
 build*/
 install*/
+include/aidge/backend/cpu_version.h
 
 # VSCode
 .vscode
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e191c3..4329d993 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,15 +3,18 @@ set(CXX_STANDARD 14)
 
 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
 
+# Parse version.txt to retrieve Major, Minor and Patch
+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ ${version})
+set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
+set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
+set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
+
 project(aidge_backend_cpu
         VERSION ${version}
         DESCRIPTION "CPU implementations of the operators of aidge framework."
         LANGUAGES CXX)
 
-message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
-message(STATUS "Project version: ${version}")
-add_definitions(-DPROJECT_VERSION="${version}")
-
+# Retrieve latest git commit
 execute_process(
     COMMAND git rev-parse --short HEAD
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
@@ -19,8 +22,10 @@ execute_process(
     OUTPUT_STRIP_TRAILING_WHITESPACE
     ERROR_QUIET
 )
+
+message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
+message(STATUS "Project version: ${version}")
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
-add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
 # helper for LSP users
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -115,6 +120,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
     append_coverage_compiler_flags()
 endif()
 
+message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h")
+# Generate version.h file from config file version.h.in
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h"
+)
+
 ##############################################
 # Installation instructions
 include(GNUInstallDirs)
diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py
index a7fe1ea3..bb320b2f 100644
--- a/aidge_backend_cpu/__init__.py
+++ b/aidge_backend_cpu/__init__.py
@@ -1,3 +1,2 @@
 import aidge_core
 from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
-from ._version import *
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index caa75328..2223acef 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -12,6 +12,8 @@
 #ifndef AIDGE_CPU_IMPORTS_H_
 #define AIDGE_CPU_IMPORTS_H_
 
+#include "aidge/backend/cpu_version.h"
+
 #include "aidge/backend/cpu/operator/AbsImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AndImpl.hpp"
diff --git a/include/aidge/backend/version.h.in b/include/aidge/backend/version.h.in
new file mode 100644
index 00000000..4b876f63
--- /dev/null
+++ b/include/aidge/backend/version.h.in
@@ -0,0 +1,11 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+namespace Aidge {
+static constexpr const int PROJECT_VERSION_MAJOR = @PROJECT_VERSION_MAJOR@;
+static constexpr const int PROJECT_VERSION_MINOR = @PROJECT_VERSION_MINOR@;
+static constexpr const int PROJECT_VERSION_PATCH = @PROJECT_VERSION_PATCH@;
+static constexpr const char * PROJECT_VERSION = "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@";
+static constexpr const char * PROJECT_GIT_HASH = "@GIT_COMMIT_HASH@";
+}
+#endif // VERSION_H
diff --git a/include/aidge/utils/sys_info/CpuVersionInfo.hpp b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
index 887ce839..3df70d13 100644
--- a/include/aidge/utils/sys_info/CpuVersionInfo.hpp
+++ b/include/aidge/utils/sys_info/CpuVersionInfo.hpp
@@ -2,17 +2,20 @@
 #define AIDGE_UTILS_SYS_INFO_CPU_VERSION_INFO_H
 
 #include "aidge/utils/Log.hpp"
+#include "aidge/backend/cpu_version.h"
 
 namespace Aidge {
 
-#ifndef PROJECT_VERSION // Normally defined in CMakeLists.txt
-#define PROJECT_VERSION "Unknown version"
-#endif
-#ifndef GIT_COMMIT_HASH
-#define GIT_COMMIT_HASH ""
-#endif
-void showCpuVersion() {
-    Log::info("Aidge backend CPU: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__);
+constexpr inline const char * getBackendCPUProjectVersion(){
+    return PROJECT_VERSION;
+}
+
+constexpr inline const char * getBackendCPUGitHash(){
+    return PROJECT_GIT_HASH;
+}
+
+void showBackendCpuVersion() {
+    Log::info("Aidge backend CPU: {} ({}), {} {}", getBackendCPUProjectVersion(), getBackendCPUGitHash(), __DATE__, __TIME__);
         // Compiler version
     #if defined(__clang__)
     /* Clang/LLVM. ---------------------------------------------- */
diff --git a/project_name.txt b/project_name.txt
new file mode 100644
index 00000000..25caafdd
--- /dev/null
+++ b/project_name.txt
@@ -0,0 +1 @@
+aidge_backend_cpu
diff --git a/pyproject.toml b/pyproject.toml
index 3c08302d..baa61de5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,17 +7,25 @@ dependencies = [
 requires-python = ">= 3.7"
 readme = "README.md"
 license = { file = "LICENSE" }
-classifiers = [ 
+classifiers = [
     "Development Status :: 2 - Pre-Alpha",
     "Programming Language :: Python :: 3"
     ]
-dynamic = ["version"] # defined in tool.setuptools_scm
+dynamic = ["version"] # defined by pbr
+
+
+[project.urls]
+Homepage = "https://www.deepgreen.ai/en/platform"
+Documentation = "https://eclipse-aidge.readthedocs.io/en/latest/"
+Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu"
+Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/issues"
+Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/releases"
 
 [build-system]
 requires = [
     "setuptools>=64",
-    "setuptools_scm[toml]==7.1.0",
-    "cmake>=3.18.4.post1"
+    "cmake>=3.18.4.post1",
+    "pbr"
 ]
 build-backend = "setuptools.build_meta"
 
@@ -29,9 +37,6 @@ where = ["."]  # list of folders that contain the packages (["."] by default)
 include = ["aidge_backend_cpu*"]  # package names should match these glob patterns (["*"] by default)
 exclude = ["aidge_backend_cpu.unit_tests*"]  # exclude packages matching these glob patterns (empty by default)
 namespaces = false  # to disable scanning PEP 420 namespaces (true by default)
-# SETUPTOOLS_SCM
-[tool.setuptools_scm]
-write_to = "aidge_backend_cpu/_version.py"
 
 #####################################################
 # CIBUILDWHEEL
diff --git a/python_binding/pybind_cpu.cpp b/python_binding/pybind_cpu.cpp
index d5022e1d..e576de08 100644
--- a/python_binding/pybind_cpu.cpp
+++ b/python_binding/pybind_cpu.cpp
@@ -6,10 +6,10 @@ namespace py = pybind11;
 
 namespace Aidge {
 
-void init_cpu_sys_info(py::module& m);
+void init_CpuVersionInfo(py::module& m);
 
 void init_Aidge(py::module& m){
-    init_cpu_sys_info(m);
+    init_CpuVersionInfo(m);
 }
 
 
diff --git a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
index 573bee36..7461dd95 100644
--- a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
+++ b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp
@@ -3,7 +3,9 @@
 
 namespace py = pybind11;
 namespace Aidge {
-void init_cpu_sys_info(py::module& m){
-    m.def("show_cpu_version", &showCpuVersion);
+void init_CpuVersionInfo(py::module& m){
+    m.def("show_version", &showBackendCpuVersion);
+    m.def("get_project_version", &getBackendCPUProjectVersion);
+    m.def("get_git_hash", &getBackendCPUGitHash);
 }
 }
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..aa0f227f
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+# pbr file
+[metadata]
+version = file: version.txt
diff --git a/setup.py b/setup.py
index 22cbd973..82edd09c 100644
--- a/setup.py
+++ b/setup.py
@@ -11,8 +11,10 @@ from math import ceil
 from setuptools import setup, Extension
 from setuptools.command.build_ext import build_ext
 
+def get_project_name() -> str:
+    return open(pathlib.Path().absolute() / "project_name.txt", "r").read().strip()
 
-PROJECT_NAME = "aidge_backend_cpu"
+PROJECT_NAME = get_project_name()
 
 SETUP_DIR = pathlib.Path(__file__).parent
 
@@ -66,7 +68,7 @@ class AidgePkgBuild(build_ext):
             else []
         )
         test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
-        
+
         self.spawn(
             [
                 "cmake",
-- 
GitLab

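Assuming the project has been configured so that the generated cpu_version.h exists, the constants it defines can be consumed directly from C++; a minimal usage sketch (the main() wrapper is only for illustration):

    #include <iostream>

    // Generated by configure_file() from include/aidge/backend/version.h.in
    #include "aidge/backend/cpu_version.h"

    int main() {
        std::cout << "aidge_backend_cpu " << Aidge::PROJECT_VERSION
                  << " (" << Aidge::PROJECT_GIT_HASH << ")\n";
        std::cout << "major=" << Aidge::PROJECT_VERSION_MAJOR
                  << " minor=" << Aidge::PROJECT_VERSION_MINOR
                  << " patch=" << Aidge::PROJECT_VERSION_PATCH << '\n';
        return 0;
    }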

From 729b412a65cfa65e4ef26e3d172883178261531e Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 3 Jan 2025 09:19:52 +0000
Subject: [PATCH 04/30] Only include what is used for 'Test_ReduceSumImpl.cpp'

---
 unit_tests/operator/Test_ReduceSumImpl.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/unit_tests/operator/Test_ReduceSumImpl.cpp b/unit_tests/operator/Test_ReduceSumImpl.cpp
index 49569d1f..0aa543da 100644
--- a/unit_tests/operator/Test_ReduceSumImpl.cpp
+++ b/unit_tests/operator/Test_ReduceSumImpl.cpp
@@ -9,17 +9,22 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::uint16_t, std::int32_t
 #include <memory>
-#include <numeric>   // std::accumulate
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/data/Data.hpp"  // DataType
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/operator/ReduceSum.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
 #include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/Types.h"
 
 using namespace Aidge;
 
@@ -112,7 +117,7 @@ TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
                 std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                 myInput->setBackend("cpu");
                 myInput->setDataType(DataType::Float32);
-                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true);
+                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<std::int32_t>{}, false, true);
                 auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
                 op->associateInput(0,myInput);
                 op->setDataType(DataType::Float32);
-- 
GitLab


From 58c5ec69ebfcb88b66d9be031381cbeb765564c5 Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Thu, 21 Nov 2024 14:09:58 +0000
Subject: [PATCH 05/30] Add WeightInterleaving Operator CPU Implementation

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../cpu/operator/WeightInterleavingImpl.hpp   |  37 ++
 .../WeightInterleavingImpl_kernels.hpp        | 105 ++++++
 src/operator/WeightInterleavingImpl.cpp       |  75 ++++
 .../operator/Test_WeightInterleavingImpl.cpp  | 330 ++++++++++++++++++
 5 files changed, 548 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
 create mode 100644 src/operator/WeightInterleavingImpl.cpp
 create mode 100644 unit_tests/operator/Test_WeightInterleavingImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 2223acef..98015d5b 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -53,6 +53,7 @@
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
 
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp b/include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp
new file mode 100644
index 00000000..0b3b1c57
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using WeightInterleavingImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavingImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
new file mode 100644
index 00000000..422afab5
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
@@ -0,0 +1,105 @@
+
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
+
+#include <algorithm>
+
+#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+
+namespace Aidge {
+
+    /**
+     * @brief Compacts 8-bit data into a smaller bit-width representation.
+     * 
+     * This function takes an array of 8-bit data and compacts it into smaller chunks 
+     * based on the specified bit-width `nb_bits`. Each element in `compactData` will 
+     * store multiple packed `nb_bits` segments extracted from `data`.
+     * 
+     * @param data The input array of 8-bit values to be compacted.
+     * @param dataSize The size of the input `data` array.
+     * @param compactData The output array storing the compacted data.
+     * @param nb_bits The number of bits to extract from each `data` element (must be between 1 and 4).
+     */
+    void compact_data(const std::int8_t* data, std::size_t dataSize, std::int8_t* compactData, std::uint8_t nb_bits) {
+        AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width
+
+        // Mask to extract `nb_bits` from each data element
+        const unsigned int mask = (1U << nb_bits) - 1;
+
+        // Calculate the number of `nb_bits` segments that fit into an 8-bit compacted value
+        const unsigned int nbSlot = 8 / nb_bits;
+
+        // Case nb_bits=3 or 4, then shift is 4
+        // Case nb_bits=2, then shift is 2
+        // Case nb_bits=1, then shift is 1
+        std::uint8_t shift = 8 / nbSlot;
+
+        const unsigned int nbFullCompactbytes = dataSize / nbSlot;
+        
+        // Main loop to process data in groups of `nbSlot`
+        for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
+            std::int8_t compact = 0;
+            
+            for (unsigned int j = 0; j < nbSlot; ++j) {
+                compact |= (data[i * nbSlot + j] & mask);    // Apply mask to keep `nb_bits` only
+                
+                // Shift only if not on the last slot to make room for the next `nb_bits`
+                if (j < nbSlot - 1) {
+                    compact <<= shift;
+                }
+            }
+            // Store the compacted value in the output array
+            compactData[i] = compact;
+        }
+        
+
+        // Handle any remaining data elements (if dataSize is not a multiple of nbSlot).
+        std::size_t remaining = dataSize % nbSlot;
+        if (remaining != 0) {
+            std::int8_t compact = 0;
+            for (std::size_t j = 0; j < remaining; ++j) {
+                compact |= (data[nbFullCompactbytes*nbSlot + j] & mask);
+                
+                if (j < remaining - 1) {
+                    compact <<= shift;
+                }
+            }
+            compact <<= (shift*(nbSlot - remaining));
+            // Store the last compacted value
+            compactData[dataSize / nbSlot] = compact;
+        }
+    }
+
+template <class I, class O, int nb_bits>
+void WeightInterleavingImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
+                            const DimSize_t nb_interleaving,
+                            const DimSize_t output_interleaving,
+                            const void* input_,
+                            void* output_) {
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // Compact each interleaving segment of the input independently into the output
+    for (std::size_t i=0; i<nb_interleaving; ++i){
+        compact_data(input+(i*input_interleaving), input_interleaving, output+(i*output_interleaving), static_cast<std::uint8_t>(nb_bits));
+    }
+
+}
+
+
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+
+}
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/src/operator/WeightInterleavingImpl.cpp b/src/operator/WeightInterleavingImpl.cpp
new file mode 100644
index 00000000..afb79179
--- /dev/null
+++ b/src/operator/WeightInterleavingImpl.cpp
@@ -0,0 +1,75 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
+
+#include <cstddef>  // std::size_t
+#include <functional>
+#include <memory>
+#include <tuple>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
+
+
+template <>
+void Aidge::WeightInterleavingImpl_cpu::forward()
+{
+    const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp);
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0");
+
+    const auto impl = Registrar<WeightInterleavingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
+
+    // inputInterleaving is the number of consecutive input elements that will be compacted 
+    // Here the interleaving is the last dimension (cf STM32 low bit kernels)
+    std::size_t inputInterleaving = input0.dims().back();
+
+    // The resulting compacted dimension was computed in forwardDims and the output tensor was resized
+    std::size_t outputInterleaving = op_.getOutput(0)->dims().back();
+
+    // nb_interleaving is the number of compacted segments 
+    std::size_t nbInterleaving;
+
+    // Determine the number of segments to compact
+    if (input0.dims().size() > 1){
+        nbInterleaving = std::accumulate(
+        input0.dims().cbegin(),
+        std::prev(input0.dims().cend()), // Exclude the last element
+        std::size_t(1),
+        std::multiplies<std::size_t>());
+    } else {
+        // Case when the weight tensor is only one dimension
+        nbInterleaving = 1;
+    }
+
+    impl.forward(inputInterleaving,
+        nbInterleaving,
+        outputInterleaving,
+        input0.getImpl()->rawPtr(),
+        getCPUPtr(mOp.getRawOutput(0)));
+    
+    
+}
+
+template <>
+void Aidge::WeightInterleavingImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu");
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
new file mode 100644
index 00000000..8d4a6ac5
--- /dev/null
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -0,0 +1,330 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
+    
+    std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
+    auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
+
+    SECTION("CompactDataSize - Single element cases") {
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1);  // 1 bit, needs 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1);  // 7 bits, needs 1 byte
+    }
+
+    SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1);  // 8 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2);  // 8 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4);  // 8 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4);  // 8 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Larger dataSize values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2);  // 16 elements at 1 bit each, fits in 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4);  // 16 elements at 2 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8);  // 16 elements at 3 bits each (2 per byte), needs 8 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8);  // 16 elements at 4 bits each, needs 8 bytes
+    }
+
+    SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1);  // 7 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2);  // 7 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4);  // 7 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4);  // 7 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Minimum and maximum values for nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1);  // 5 elements at 1 bit each, fits in 1 byte
+    }
+
+    SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") {
+        REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0);  // No data elements
+    }
+
+
+    SECTION("CompactData - 4-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x0F), 
+                                                                static_cast<std::int8_t>(0xF5), 
+                                                                static_cast<std::int8_t>(0xB3), 
+                                                                static_cast<std::int8_t>(0x9C)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5), 
+                                                                static_cast<int8_t>(0x3C)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int4);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 3-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<int8_t>(0x0F), 
+                                                                static_cast<int8_t>(0x05), 
+                                                                static_cast<int8_t>(0x04),
+                                                                static_cast<int8_t>(0xD3)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75), 
+                                                                static_cast<int8_t>(0x43)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int3);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - 2-bit compaction") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
+                                                                {static_cast<std::int8_t>(0x03),
+                                                                 static_cast<std::int8_t>(0x02),
+                                                                 static_cast<std::int8_t>(0x01), 
+                                                                 static_cast<std::int8_t>(0x00)}
+                                                                 });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int2);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xE4)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int2);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int2);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Single element data") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0x0F)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
+                                                                {static_cast<int8_t>(0xF0)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int4);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F), 
+                                                                static_cast<int8_t>(0xA5), 
+                                                                static_cast<int8_t>(0x34)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0xF5), 
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int4);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
+
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
+                                                                {static_cast<int8_t>(0x0F), 
+                                                                static_cast<int8_t>(0x05), 
+                                                                static_cast<int8_t>(0x04)}
+                                                                });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
+                                                                {static_cast<int8_t>(0x75), 
+                                                                static_cast<int8_t>(0x40)}
+                                                                });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int3);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+
+    }
+
+    SECTION("Forward Op - Convolution weight interleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            } 
+        });
+        
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            } 
+        });
+
+        weight->setDataFormat(Aidge::DataFormat::NHWC);
+        weight->setDataType(Aidge::DataType::Int4);
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(DataType::Int4);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+}
-- 
GitLab

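As a sanity check of the packing scheme introduced by compact_data above, a minimal standalone re-implementation of the same mask-and-shift logic (simplified to unsigned bytes, no trailing-remainder handling; pack is an illustrative name), reproducing two vectors from Test_WeightInterleavingImpl.cpp:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Pack one nb_bits-wide value per input byte into full bytes,
    // most significant slot first, as in the kernel above.
    std::vector<std::uint8_t> pack(const std::vector<std::uint8_t>& data, unsigned nb_bits) {
        const unsigned mask   = (1u << nb_bits) - 1;  // keep the low nb_bits of each element
        const unsigned nbSlot = 8 / nb_bits;          // elements packed per compacted byte
        const unsigned shift  = 8 / nbSlot;           // bit offset between consecutive slots
        std::vector<std::uint8_t> out(data.size() / nbSlot);
        for (std::size_t i = 0; i < out.size(); ++i) {
            std::uint8_t compact = 0;
            for (unsigned j = 0; j < nbSlot; ++j) {
                compact |= (data[i * nbSlot + j] & mask);
                if (j < nbSlot - 1) compact <<= shift;  // make room for the next slot
            }
            out[i] = compact;
        }
        return out;
    }

    int main() {
        // 2-bit case from the unit test: {0x03, 0x02, 0x01, 0x00} packs to 0xE4.
        assert((pack({0x03, 0x02, 0x01, 0x00}, 2) == std::vector<std::uint8_t>{0xE4}));
        // 4-bit case: the low nibbles of {0x0F, 0xF5, 0xB3, 0x9C} pack to {0xF5, 0x3C}.
        assert((pack({0x0F, 0xF5, 0xB3, 0x9C}, 4) == std::vector<std::uint8_t>{0xF5, 0x3C}));
        return 0;
    }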

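And a small standalone check of the segment-count computation in WeightInterleavingImpl_cpu::forward() above: for the convolution test's weight shape {2, 3, 3, 4}, the last dimension is the interleaved axis and the number of segments to compact is the product of the remaining dimensions:

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<std::size_t> dims{2, 3, 3, 4};
        // Product of all dimensions except the last one: 2 * 3 * 3 = 18 segments.
        const std::size_t nbInterleaving = std::accumulate(dims.cbegin(), std::prev(dims.cend()),
                                                           std::size_t(1), std::multiplies<std::size_t>());
        std::cout << nbInterleaving << '\n';   // prints: 18
        return 0;
    }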
From 25e707788840dc90be15ef41f5debedd826ac6bb Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Thu, 21 Nov 2024 14:11:25 +0000
Subject: [PATCH 06/30] Add convolution registrar entry for int4 without
 implementation - Needed to generate scheduling

---
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 1229d571..b08890c6 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -268,6 +268,9 @@ REGISTRAR(ConvImpl2D_cpu,
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
     {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int4, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, nullptr, nullptr});
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
     {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
-- 
GitLab


From 23a110dc213c8ec3342912ce15f2b47751fa64fe Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Wed, 4 Dec 2024 13:41:39 +0000
Subject: [PATCH 07/30] Update implementation kernels for new low-bit integer
 datatypes

---
 .../WeightInterleavingImpl_kernels.hpp        | 36 ++++++++++++++++---
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
index 422afab5..f2347fd2 100644
--- a/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
@@ -23,7 +23,8 @@ namespace Aidge {
      * @param compactData The output array storing the compacted data.
      * @param nb_bits The number of bits to extract from each `data` element (must be between 1 and 4).
      */
-    void compact_data(const std::int8_t* data, std::size_t dataSize, std::int8_t* compactData, std::uint8_t nb_bits) {
+    template <typename T>
+    void compact_data(const T* data, std::size_t dataSize, T* compactData, std::uint8_t nb_bits) {
         AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width
 
         // Mask to extract `nb_bits` from each data element
@@ -41,7 +42,7 @@ namespace Aidge {
         
         // Main loop to process data in groups of `nbSlot`
         for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
-            std::int8_t compact = 0;
+            T compact = 0;
             
             for (unsigned int j = 0; j < nbSlot; ++j) {
                 compact |= (data[i * nbSlot + j] & mask);    // Apply mask to keep `nb_bits` only
@@ -91,14 +92,39 @@ void WeightInterleavingImpl_cpu_forward_kernel(const DimSize_t input_interleavin
 
 
 REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int4>::type, DataFormat::NHWC}},
     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
 REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int3>::type, DataFormat::NHWC}},
     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
 REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int2>::type, DataFormat::NHWC}},
     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Binary>::type, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
+
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt4>::type, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt3>::type, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavingImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt2>::type, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
+
+
+// REGISTRAR(WeightInterleavingImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+// REGISTRAR(WeightInterleavingImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+// REGISTRAR(WeightInterleavingImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+
 
 }
 
-- 
GitLab


From 216d0ace2c510a47725ac5ea0f70ac9922ce3b34 Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Wed, 4 Dec 2024 13:42:52 +0000
Subject: [PATCH 08/30] Add test for applyWeightInterleaving recipe

---
 .../operator/Test_WeightInterleavingImpl.cpp  | 134 ++++++++++++++++--
 1 file changed, 120 insertions(+), 14 deletions(-)

diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
index 8d4a6ac5..9bd9f146 100644
--- a/unit_tests/operator/Test_WeightInterleavingImpl.cpp
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -13,6 +13,8 @@
 
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 #include "aidge/backend/cpu.hpp"
 
@@ -77,12 +79,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int4);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -106,12 +108,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int3);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -134,12 +136,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int2);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int2>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int2);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int2>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -159,12 +161,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int4);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -187,12 +189,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int4);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -217,12 +219,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int3);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -315,16 +317,120 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         weight->setDataType(Aidge::DataType::Int4);
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
+        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(DataType::Int4);
+        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
         REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
     }
 
+    SECTION("Recipie ApplyWeightInterleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6,  0,  5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5,  5,  4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1,  4, -7}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1,  3,  1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1,  4}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1,  3,  5,  6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8,  4,  7,  1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5,  0, -1, -2}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7,  7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7,  3,  0,  2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1,  2,  3}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8,  3,  5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1,  7,  5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8,  1,  2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            } 
+        });
+        
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)}  // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)}  // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)}  // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)}  // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)}  // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            } 
+        });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Dual_Int4);
+
+        // Create convolution node
+        std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
+        
+        // Place the weight tensor in the weight producer of the conv
+        auto weightProducer = conv->getParent(1);
+        weightProducer->getOperator()->setOutput(0, weight);
+
+        // Set dataType, dataFormat and backend of the convolution
+        conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
+        conv->getOperator()->setDataType(Aidge::DataType::Int4);
+        conv->getOperator()->setBackend("cpu");
+
+        // Apply recipe
+        applyWeightInterleaving(conv);
+
+        // Compare the weight producer output tensor with the expected interleaved weights
+        auto newProdOp = std::static_pointer_cast<OperatorTensor>(conv->getParent(1)->getOperator());
+        REQUIRE(*(newProdOp->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
 }
-- 
GitLab
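
The expected tensors in the test above pack two signed 4-bit weights into each byte, first value in the high nibble (e.g. {-6, 0} -> 0xA0, {5, -8} -> 0x58). A minimal sketch of that packing, using a hypothetical helper name that is not part of the patch:

    #include <cstdint>

    // Packs two signed 4-bit values (given as int8_t in [-8, 7]) into one byte,
    // first value in the high nibble, matching the expected tensors above.
    std::int8_t packInt4Pair(std::int8_t first, std::int8_t second) {
        const std::uint8_t hi = static_cast<std::uint8_t>(first)  & 0x0F; // 4-bit two's complement
        const std::uint8_t lo = static_cast<std::uint8_t>(second) & 0x0F;
        return static_cast<std::int8_t>((hi << 4) | lo);
    }

    // packInt4Pair(-6,  0) == static_cast<std::int8_t>(0xA0)
    // packInt4Pair( 5, -8) == static_cast<std::int8_t>(0x58)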


From 0650d4a77def64790dc5b15f9d27f42b39fcdde3 Mon Sep 17 00:00:00 2001
From: thibault allenet <thibault.allenet@cea.fr>
Date: Fri, 13 Dec 2024 15:42:59 +0000
Subject: [PATCH 09/30] remove registry for int4 uint4 convImpl

---
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index b08890c6..1229d571 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -268,9 +268,6 @@ REGISTRAR(ConvImpl2D_cpu,
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
     {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
-REGISTRAR(ConvImpl2D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Int4, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, nullptr, nullptr});
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
     {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
-- 
GitLab


From 9a4000ed824ff3eff7fce279cbf5d81f42cf5ced Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 9 Dec 2024 14:08:02 +0100
Subject: [PATCH 10/30] Added more tests

---
 unit_tests/operator/Test_Memorize.cpp   | 65 +++++++++++++++++++++++
 unit_tests/scheduler/Test_Scheduler.cpp | 69 +++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 unit_tests/operator/Test_Memorize.cpp

diff --git a/unit_tests/operator/Test_Memorize.cpp b/unit_tests/operator/Test_Memorize.cpp
new file mode 100644
index 00000000..45ab40c5
--- /dev/null
+++ b/unit_tests/operator/Test_Memorize.cpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <memory>
+#include <string>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/graph/GraphView.hpp"
+#include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Producer.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/recipes/GraphViewHelper.hpp"
+
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") {
+    SECTION("Test simple") {
+        std::shared_ptr<Tensor> inputTensor =
+                std::make_shared<Tensor>(Array1D<int, 1>{{1}});
+
+        auto input = Producer({1}, "input");
+        auto init = Producer({1}, "init");
+        auto add = Add("add");
+        auto mem = Memorize(3, "mem");
+
+        input->addChild(add, 0, 0);
+        init->addChild(mem, 0, 1);
+        add->addChild(mem, 0,0);
+        mem->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
+
+        input->getOperator()->setOutput(0, inputTensor);
+        init->getOperator()->setOutput(0, inputTensor);
+
+        auto g = getConnectedGraphView(input);
+
+        g->setDataType(Aidge::DataType::Int32);
+        g->setBackend("cpu");
+        g->forwardDims();
+        g->save("simple_graph");
+
+        SequentialScheduler scheduler(g);
+        REQUIRE_NOTHROW(scheduler.forward());
+        scheduler.saveSchedulingDiagram("simple");
+
+        const auto expectedOutput = std::make_shared<Tensor>(Array1D<int, 1>{{4}});
+        std::shared_ptr<Tensor> other = std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0);
+        other->print();
+        REQUIRE((*other == *expectedOutput));
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 78a10c30..9224d6f9 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -18,6 +18,10 @@
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
 #include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Pop.hpp"
+#include "aidge/operator/Stack.hpp"
+#include "aidge/operator/Identity.hpp"
+#include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
 
@@ -438,4 +442,69 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     predictedOutput->setGrad(targetOutput);
     REQUIRE_NOTHROW(scheduler.backward());
 }
+
+std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
+    auto input = Identity((!name.empty()) ? name + "_input" : "");
+    auto hiddenState = Memorize(seqLength, (!name.empty()) ? name + "_hidden_state" : "");
+    auto add = Add((!name.empty()) ? name + "_add" : "");
+
+    input->addChild(add, 0, 0);
+    add->addChild(hiddenState, 0,0);
+    hiddenState->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
+
+    std::shared_ptr<GraphView> microGraph = std::make_shared<GraphView>();
+    microGraph->add(input);
+    microGraph->add({hiddenState, add});
+    microGraph->setOrderedInputs({{input, 0}, {hiddenState, 1}});
+    microGraph->setOrderedOutputs({{hiddenState, 0}});
+
+    auto metaOp = MetaOperator("Accumulate", microGraph, {}, name);
+    return metaOp;
+}
+
+TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
+    std::shared_ptr<Tensor> Input = std::make_shared<Tensor>(
+        Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                 {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+
+    std::shared_ptr<Tensor> MemInit =
+        std::make_shared<Tensor>(Array2D<float, 3, 2>{
+            {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+
+    auto meta = Accumulate(2, "accumulate");
+    auto op = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator());
+    auto pop_i = Pop("pop_input");
+    auto pop_o = Identity("pop_output"); // NOTE: Could be Identity/Stack/Whatever node you want, this is is not the problem here
+
+    pop_i->getOperator()->associateInput(0, Input);
+    pop_i->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
+    op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(pop_o, 0, 0);
+
+    //pop_i->addChild(meta, 0, 0);
+    //meta->addChild(pop_o, 0, 0);
+
+    //op->associateInput(1, MemInit);
+    op->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, MemInit);
+
+    // Build the graph.
+    auto myGraph = std::make_shared<GraphView>();
+    myGraph->add(pop_i);
+    myGraph->add(op->getMicroGraph());
+    //myGraph->add(meta);
+    myGraph->add(pop_o);
+    myGraph->compile("cpu", DataType::Float32);
+
+    myGraph->save("accumulate_graph", true);
+
+    // Schedule and run
+    auto scheduler = SequentialScheduler(myGraph);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("accumulate_scheduling");
+    REQUIRE_NOTHROW(scheduler.forward(true));
+
+    std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
+    std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
+    REQUIRE(*output == *expectedOutput);
+}
 } // namespace Aidge
-- 
GitLab
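
For reference, my reading of the two tests added above (the expected values are not derived in the patch): Memorize is initialized to 1 and, over the 3 scheduled steps, the Add node feeds input + mem back into it, which yields 4; the Accumulate meta-operator sums the two time steps of Input on top of MemInit, giving {{3,5},{7,9},{11,13}}. A plain C++ trace of the Memorize recurrence:

    // Informal trace of the Memorize(3) test: mem <- input + mem, three times.
    int mem = 1;          // init producer
    const int input = 1;  // input producer
    for (int step = 0; step < 3; ++step) {
        mem = input + mem;   // Add node feeding the Memorize node
    }
    // mem == 4, matching the expected output tensor {4}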


From 171e0702c490a62c1fbca404653f790cd74024d5 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 17 Jan 2025 12:56:20 +0000
Subject: [PATCH 11/30] UPD: test includes

---
 unit_tests/data/Test_TensorImpl.cpp           | 25 ++++---
 unit_tests/operator/Test_AddImpl.cpp          |  8 +-
 unit_tests/operator/Test_AndImpl.cpp          | 14 +++-
 unit_tests/operator/Test_ArgMaxImpl.cpp       | 28 ++++---
 unit_tests/operator/Test_Atan.cpp             | 16 ++--
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 16 ++--
 unit_tests/operator/Test_BatchNormImpl.cpp    | 14 ++--
 unit_tests/operator/Test_BitShift.cpp         | 32 ++++----
 unit_tests/operator/Test_ClipImpl.cpp         | 73 ++++++++++---------
 .../Test_GlobalAveragePoolingImpl.cpp         | 52 ++++++-------
 10 files changed, 149 insertions(+), 129 deletions(-)

diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp
index fd938f10..2bc1e7d4 100644
--- a/unit_tests/data/Test_TensorImpl.cpp
+++ b/unit_tests/data/Test_TensorImpl.cpp
@@ -9,19 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
-#include "aidge/data/Tensor.hpp"
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/operator/Add.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 namespace Aidge {
 
@@ -35,8 +39,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") {
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
-    std::shared_ptr<Node> mySub = Add();
-    auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bca40257..720c4ca2 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -9,12 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 053bb3ea..c2309dce 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -9,13 +9,19 @@
  *
  ********************************************************************************/
 
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <memory>
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
 #include <catch2/catch_test_macros.hpp>
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
 
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/And.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -180,7 +186,7 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
         }                                       //
         });                                     //
 
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
             {                                   //
                 {                               //
diff --git a/unit_tests/operator/Test_ArgMaxImpl.cpp b/unit_tests/operator/Test_ArgMaxImpl.cpp
index 9915d904..894697f6 100644
--- a/unit_tests/operator/Test_ArgMaxImpl.cpp
+++ b/unit_tests/operator/Test_ArgMaxImpl.cpp
@@ -9,17 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>  // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/ArgMax.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
-#include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -118,8 +121,8 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2") {
 
             Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
-               { 
-                    { 
+               {
+                    {
                         {3.0},
                         {2.0},
                         {1.0}
@@ -144,7 +147,7 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
         SECTION("Axis 2 with keep_dims false") {
 
             Tensor myOutput = Tensor(Array2D<float,2,3> {
-               { 
+               {
                     { 3.0, 2.0, 1.0 },
                     { 2.0, 1.0, 0.0 }
                }
@@ -196,10 +199,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
             op->associateInput(0,myInput);
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
-            std::cout << " ...............  "<< std::endl;
+            fmt::print("{:.^20}\n", "forward");
             myArgMax->forward();
+            fmt::print("{:.^20}\n", "result");
             op->getOutput(0)->print();
-            std::cout <<"------"<<std::endl;
+            fmt::print("{:.^20}\n", "truth");
             myOutput.print();
 
             REQUIRE(*(op->getOutput(0)) == myOutput);
diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp
index 9548e35d..b9438db0 100644
--- a/unit_tests/operator/Test_Atan.cpp
+++ b/unit_tests/operator/Test_Atan.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Atan.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -32,7 +36,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
              0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
@@ -61,7 +65,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") {
                                   {0.75377332, 0.77411225, 0.32928031}}}});
 
     std::shared_ptr<Node> myAtan = Atan();
-    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator());
     op->associateInput(0, input0);
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index aaa27578..372febc6 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -9,14 +9,18 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
-#include <cstdlib>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/AvgPooling.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -53,7 +57,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
     });
     SECTION("Stride") {
         std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
             {
@@ -90,7 +94,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         }
         });
         std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mycdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
 
         Tensor myOutput = Array4D<float,1,1,1,1> {
             {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
diff --git a/unit_tests/operator/Test_BatchNormImpl.cpp b/unit_tests/operator/Test_BatchNormImpl.cpp
index 1b42c90d..26e964f9 100644
--- a/unit_tests/operator/Test_BatchNormImpl.cpp
+++ b/unit_tests/operator/Test_BatchNormImpl.cpp
@@ -9,20 +9,24 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/BatchNorm.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] BatchNorm(forward)", "[BatchNorm][CPU]") {
     std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
-    auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator());
+    auto op = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNorm -> getOperator());
     std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}});
     std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}});
     std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}});
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index a52990bc..db97e8d3 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -9,15 +9,20 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
 #include <cstddef>   // std::size_t
 #include <cstdint>   // std::uint16_t
 #include <chrono>
-#include <iostream>
 #include <memory>
-#include <numeric>   
+#include <numeric>
 #include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/BitShift.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +34,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<int> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
@@ -131,8 +136,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {}μs\n", duration.count());
         }
         SECTION("Test BitShift kernels with Broadcasting") {
             std::size_t number_of_operation = 0;
@@ -194,7 +199,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                                 }
                                 else
                                 {
-                                    result[idx_out + d] = array0[idx0] >> array1[idx1];                               
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];
                                 }
                             }
                         }
@@ -222,12 +227,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
                 // comparison between truth and computed result
-                bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres));
-                if(equiv == false)
-                {
-                    std::cout << "Problem\n";
-                }
-                REQUIRE(equiv);
+                REQUIRE(approxEq<int>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] array1;
@@ -236,8 +236,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {}μs\n", duration.count());
         }
 
 }
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
index 45c8da5b..1a7aa5e5 100644
--- a/unit_tests/operator/Test_ClipImpl.cpp
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -9,36 +9,37 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <algorithm>  // std::max, std::min
+#include <chrono>
 #include <cstddef>  // std::size_t
 #include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include <iomanip>
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Clip.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include "aidge/backend/cpu.hpp"
 
 void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) {
     if (vec1.size() != vec2.size()) {
-        std::cerr << "Vectors should have the same sizes." << std::endl;
+        fmt::print(stderr, "Vectors should have the same sizes.\n");
         return;
     }
 
-    for (size_t i = 0; i < vec1.size(); ++i) {
+    for (std::size_t i = 0; i < vec1.size(); ++i) {
         if (vec1[i] < min || vec1[i] > max) {
             vec2[i] = 0.0f;
         }
     }
 }
-namespace Aidge 
+namespace Aidge
 {
 TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
  {
@@ -47,8 +48,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
     std::random_device rd;
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> dis(0.0, 10.0);
-    std::uniform_real_distribution<float> dismin(0.0, 4.5); 
-    std::uniform_real_distribution<float> dismax(5.5, 10.0); 
+    std::uniform_real_distribution<float> dismin(0.0, 4.5);
+    std::uniform_real_distribution<float> dismax(5.5, 10.0);
     std::uniform_int_distribution<std::size_t> distDims(5,15);
     std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5);
 
@@ -71,7 +72,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -80,7 +81,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -109,7 +110,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -118,9 +119,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
+        fmt::print("INFO: total time: {}\n", duration.count());
+    }
     SECTION("Clip test with min >= max [Forward]") {
         std::size_t totalComputation = 0;
         for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
@@ -131,7 +132,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
 
@@ -140,7 +141,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismax(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -169,7 +170,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             op->forwardDims(true);
-            
+
             start = std::chrono::system_clock::now();
             myClip->forward();
             end = std::chrono::system_clock::now();
@@ -178,13 +179,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
-    } 
+        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
+        fmt::print("INFO: total time: {}\n", duration.count());
+    }
     SECTION("Clip with Clip Attr [Forward]")
     {
         std::size_t totalComputation = 0;
-        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) 
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
         {
 
             float min = dismin(gen);
@@ -200,7 +201,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
             }
             // Convert Input to Tensor
@@ -231,8 +232,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
+        fmt::print("INFO: total time: {}\n", duration.count());
     }
     SECTION("Simple clip test [Backward]") {
         std::size_t totalComputation = 0;
@@ -243,13 +244,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             // generate Tensors dimensions
             const std::size_t dim0 = distDims(gen);
             const std::size_t dim1 = distDims(gen);
-  
+
             totalComputation += dim0*dim1;
 
             // Create and populate the array with random float values
             float* Array = new float[dim0*dim1];
             float* gradArray = new float[dim0*dim1];
-            for (int i = 0; i < dim0*dim1; ++i) {
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                 Array[i] = dis(gen); // Generate random float value
                 gradArray[i] = dis(gen);
             }
@@ -264,7 +265,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             TInput -> resize({dim0,dim1});
             TInput -> setBackend("cpu");
             TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
-            
+
             float min = dismin(gen);
             std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
             Tmin -> resize({});
@@ -296,7 +297,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             myClip->forward();
 
             op->getOutput(0)->setGrad(TGrad);
-            
+
             start = std::chrono::system_clock::now();
             REQUIRE_NOTHROW(myClip->backward());
             end = std::chrono::system_clock::now();
@@ -310,9 +311,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
             REQUIRE(GT1 == BackwardTensorVec);
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
+        fmt::print("INFO: total time: {}\n", duration.count());
     }
  }
-} // namespace Aidge 
+} // namespace Aidge
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 43af5448..63f8d326 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -9,34 +9,29 @@
  *
  ********************************************************************************/
 
-#include <aidge/utils/Types.h>
-#include <catch2/catch_test_macros.hpp>
 #include <chrono>
-#include <cmath>
 #include <cstddef> // std::size_t
 #include <cstdint> // std::uint16_t
-#include <iostream>
+#include <functional>  // std::multiplies
 #include <memory>
 #include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-
-// debug print function
-void print_tensor(Aidge::Tensor &T) {
-  // Print tensors
-  std::cout << "Tensor : size =  [";
-  for (auto &dim : T.dims()) {
-    std::cout << dim << " , ";
-  }
-  std::cout << "]" << std::endl;
-  T.print();
-}
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
 TEST_CASE("[cpu/operator] GlobalAveragePooling",
           "[GlobalAveragePooling][CPU]") {
   constexpr std::uint16_t NBTRIALS = 10;
@@ -54,9 +49,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
                                                             std::size_t(7));
 
   // Create MatGlobalAveragePooling Operator
-  std::shared_ptr<Node> globAvgPool = GlobalAveragePooling();
-  auto op =
-      std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator());
+  std::shared_ptr<GlobalAveragePooling_Op> op = std::make_shared<GlobalAveragePooling_Op>();
   op->setDataType(DataType::Float32);
   op->setBackend("cpu");
 
@@ -99,7 +92,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
       T0->resize(dims);
       T0->getImpl()->setRawPtr(array0, nb_elements);
 
-      REQUIRE_THROWS(globAvgPool->forward());
+      REQUIRE_THROWS(op->forward());
       delete[] array0;
     }
 
@@ -158,7 +151,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
         op->forwardDims();
         start = std::chrono::system_clock::now();
-        REQUIRE_NOTHROW(globAvgPool->forward());
+        REQUIRE_NOTHROW(op->forward());
         end = std::chrono::system_clock::now();
         duration +=
             std::chrono::duration_cast<std::chrono::microseconds>(end - start);
@@ -231,7 +224,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
 
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -358,7 +351,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -547,7 +540,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           Tres->getImpl()->setRawPtr(result, out_nb_elems);
           op->forwardDims();
           start = std::chrono::system_clock::now();
-          REQUIRE_NOTHROW(globAvgPool->forward());
+          REQUIRE_NOTHROW(op->forward());
           end = std::chrono::system_clock::now();
           duration += std::chrono::duration_cast<std::chrono::microseconds>(
               end - start);
@@ -561,12 +554,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           delete[] result;
         }
       }
-      std::cout << "GlobalAveragePooling total execution time : "
-                << duration.count() << "µs" << std::endl;
-      std::cout << "Number of operations : " << number_of_operation
-                << std::endl;
-      std::cout << "Operation / µs = " << number_of_operation / duration.count()
-                << std::endl;
+      fmt::print("INFO: GlobalAveragePooling total execution time: {}µs\n", duration.count());
+      fmt::print("INFO: Number of operations : {}\n", number_of_operation);
+      fmt::print("INFO: Operation / µs = {}\n", number_of_operation / duration.count());
     }
   }
 }
-- 
GitLab
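
The fmt calls introduced above rely on standard fmt format specifiers; "{:.^20}", for instance, centers its argument in a 20-character field padded with dots. A small standalone sketch (assuming fmt is available, as these tests do):

    #include <fmt/core.h>

    int main() {
        fmt::print("{:.^20}\n", "forward");          // prints ......forward.......
        fmt::print("INFO: total time: {}μs\n", 42);  // plain positional substitution
        return 0;
    }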


From a77b7a0ea5c816a9086cced44cb1760a5b505211 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 17 Jan 2025 12:57:35 +0000
Subject: [PATCH 12/30] ENHANCE: allow using a local install of the Catch2
 library or downloading the latest available

---
 unit_tests/CMakeLists.txt | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 5984524f..6bd1a89d 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,12 +1,19 @@
-Include(FetchContent)
+find_package(Catch2 REQUIRED)
 
-FetchContent_Declare(
-  Catch2
-  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.7.1 # or a later release
-)
+if(NOT Catch2_FOUND)
+    message(STATUS "Catch2 not found in system, retrieving from git")
+    Include(FetchContent)
 
-FetchContent_MakeAvailable(Catch2)
+    FetchContent_Declare(
+      Catch2
+      GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+      GIT_TAG        devel # or a later release
+    )
+
+    FetchContent_MakeAvailable(Catch2)
+else()
+    message(STATUS "Found system Catch2 version ${Catch2_VERSION}")
+endif()
 
 file(GLOB_RECURSE src_files "*.cpp")
 
-- 
GitLab


From 2a5a4255afa0209842fcb3045bd8dfa2c6f43d63 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 17 Jan 2025 13:19:01 +0000
Subject: [PATCH 13/30] Change REQUIRED to QUIET in the Catch2 find_package
 option for tests, as it is downloaded if not found

---
 unit_tests/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 6bd1a89d..7e63fb2b 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,4 +1,4 @@
-find_package(Catch2 REQUIRED)
+find_package(Catch2 QUIET)
 
 if(NOT Catch2_FOUND)
     message(STATUS "Catch2 not found in system, retrieving from git")
-- 
GitLab


From 853884477c1905cc86cfa5e0654ef2c55691bb63 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 17 Jan 2025 21:54:12 +0000
Subject: [PATCH 14/30] Fix includes and remove 'iostream' in some more
 tests

---
 .../operator/Test_ConstantOfShapeImpl.cpp     |  33 +-
 unit_tests/operator/Test_DivImpl.cpp          |  35 +-
 unit_tests/operator/Test_MatMulImpl.cpp       |  33 +-
 unit_tests/operator/Test_MulImpl.cpp          | 494 +++++++++---------
 unit_tests/operator/Test_PowImpl.cpp          |  53 +-
 unit_tests/operator/Test_RoundImpl.cpp        |  45 +-
 unit_tests/operator/Test_SubImpl.cpp          |  35 +-
 7 files changed, 368 insertions(+), 360 deletions(-)

diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 42505d38..8ec1669b 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -9,32 +9,27 @@
  *
  ********************************************************************************/
 
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstddef> // std::size_t
-#include <cstdint> // std::uint16_t
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int64_t, std::uint16_t
 #include <memory>
-#include <numeric> // std::accumulate
-#include <ostream>
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
 
-#include "catch2/internal/catch_compiler_capabilities.hpp"
-#include "catch2/internal/catch_enforce.hpp"
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators_random.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-#include <aidge/data/Data.hpp>
-#include <aidge/data/half.hpp>
-#include <aidge/filler/Filler.hpp>
-#include <aidge/operator/OperatorTensor.hpp>
-#include <aidge/operator/Reshape.hpp>
-#include <aidge/utils/TensorUtils.hpp>
-#include <aidge/utils/Types.h>
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
 TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
@@ -62,7 +57,7 @@ TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
     result->setDataType(DataType::Int64);
     result->setBackend("cpu");
     for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<int64_t>(i, input_tensor_values_dist(gen));
+      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
     }
     return result;
   };
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 5d7dfdf1..b03fe4aa 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Div.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index d6e934b4..daef47b3 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -9,21 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>  // std::size_t
-#include <cstdint>  // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
 #include <memory>
-#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
-#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
-
 namespace Aidge {
 
 TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
@@ -106,8 +111,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        fmt::print("INFO: total time: {} μs\n", duration.count());
     }
 
     SECTION("3-D Tensors") {
@@ -174,8 +179,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        fmt::print("INFO: total time: {} μs\n", duration.count());
     }
 
     SECTION("4-D Tensors") {
@@ -244,8 +249,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl;
-        std::cout << "total time: " << duration.count() << std::endl;
+        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        fmt::print("INFO: total time: {} μs\n", duration.count());
     }
 
     SECTION("+2-D / 1-D") {
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 3378861d..925b9f20 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -9,351 +9,338 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock,
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Mul.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/Log.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
-    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
-    {
-        std::shared_ptr<Node> myMul = Mul();
-        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
+TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
+{
+    using aif32 = cpptype_t<DataType::Float32>;
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
 
-        SECTION("Case 1: 2D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
+    SECTION("Case 1: 2D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array2D<aif32,2,3>(
+            {
                 {
-                    {
-                        {1,2,3},{4,5,6}
-                    }
+                    {1,2,3},{4,5,6}
                 }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {0.1,0.2,0.3}
-            ));
+            }
+        ));
 
-            T0->setDataType(DataType::Float32);
-            T0->setBackend("cpu");
-            T1->setDataType(DataType::Float32);
-            T1->setBackend("cpu");
+        const auto T1 = std::make_shared<Tensor>(Array1D<aif32,3>(
+            {0.1,0.2,0.3}
+        ));
 
-            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<aif32,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
 
-            op->associateInput(0,T0);
-            op->associateInput(1,T1);
-            op->forwardDims();
+        op->associateInput(0,T0);
+        op->associateInput(1,T1);
+        op->forwardDims();
 
-            myMul->forward();
-            myMul->backward();
+        op->forward();
+        op->backward();
 
-            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
-            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
+        const Tensor T0Grad = Array2D<aif32, 2, 3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}});
+        const Tensor T1Grad = Array1D<aif32, 3>({5,7,9});
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
-        }
+        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), T0Grad));
+        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), T1Grad));
+    }
 
-        SECTION("Case 2: 3D and 1D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<aif32,2,2,3>(
+            {
                 {
                     {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {7.0, 8.0, 9.0},
-                            {10.0, 11.0, 12.0}
-                        }
+                        {1.0, 2.0, 3.0},
+                        {4.0, 5.0, 6.0}
+                    },
+                    {
+                        {7.0, 8.0, 9.0},
+                        {10.0, 11.0, 12.0}
                     }
                 }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
+            }
+        ));
 
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
-                    {
-                        {
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            },
-                            {
-                                {1, 1, 1},
-                                {1, 1, 1}
-                            }
-                        }
-                    }
-                ));
+        const auto T1 = std::make_shared<Tensor>(Array1D<aif32, 3>({0.3,0.2,0.1}));
 
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+        const auto newGrad = std::make_shared<Tensor>(Array3D<aif32,2,2,3>(
                 {
                     {
                         {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
+                            {1, 1, 1},
+                            {1, 1, 1}
                         },
                         {
-                            {0.3, 0.2, 0.1},
-                            {0.3, 0.2, 0.1}
+                            {1, 1, 1},
+                            {1, 1, 1}
                         }
                     }
                 }
             ));
 
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
-                {22.0, 26.0, 30.0}
-            ));
-
-            for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+        const Tensor expectedGrad0 = Array3D<aif32,2,2,3>(
             {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
+                {
+                    {
+                        {0.3, 0.2, 0.1},
+                        {0.3, 0.2, 0.1}
+                    },
+                    {
+                        {0.3, 0.2, 0.1},
+                        {0.3, 0.2, 0.1}
+                    }
+                }
             }
+        );
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        const Tensor expectedGrad1 = Array1D<aif32,3>(
+            {22.0, 26.0, 30.0}
+        );
 
-            myMul->backward();
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
-        }
+        op->backward();
+
+        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-        SECTION("Case 3: 4D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<aif32,2, 2, 3, 3>(
+            {
                 {
                     {
                         {
-                            {
-                                {1.0, 2.0, 3.0},
-                                {4.0, 5.0, 6.0},
-                                {7.0, 8.0, 9.0}
-                            },
-                            {
-                                {10.0, 11.0, 12.0},
-                                {13.0, 14.0, 15.0},
-                                {16.0, 17.0, 18.0}
-                            }
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0},
+                            {7.0, 8.0, 9.0}
                         },
                         {
-                            {
-                                {19.0, 20.0, 21.0},
-                                {22.0, 23.0, 24.0},
-                                {25.0, 26.0, 27.0}
-                            },
-                            {
-                                {28.0, 29.0, 30.0},
-                                {31.0, 32.0, 33.0},
-                                {34.0, 35.0, 36.0}
-                            }
+                            {10.0, 11.0, 12.0},
+                            {13.0, 14.0, 15.0},
+                            {16.0, 17.0, 18.0}
+                        }
+                    },
+                    {
+                        {
+                            {19.0, 20.0, 21.0},
+                            {22.0, 23.0, 24.0},
+                            {25.0, 26.0, 27.0}
+                        },
+                        {
+                            {28.0, 29.0, 30.0},
+                            {31.0, 32.0, 33.0},
+                            {34.0, 35.0, 36.0}
                         }
                     }
                 }
-            ));
+            }
+        ));
 
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
+        const auto T1 = std::make_shared<Tensor>(Array2D<aif32, 3,3>(
+            {
                 {
-                    {
-                        {0.5,0.3,0.1},
-                        {0.4,0.2,0.6},
-                        {0.7,0.8,0.9}
-                    }
+                    {0.5,0.3,0.1},
+                    {0.4,0.2,0.6},
+                    {0.7,0.8,0.9}
                 }
-            ));
+            }
+        ));
 
-            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+        const auto newGrad = std::make_shared<Tensor>(Array4D<aif32,2, 2, 3, 3>(
+            {
                 {
                     {
                         {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0}
                         },
                         {
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            },
-                            {
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0},
-                                {1.0, 1.0, 1.0}
-                            }
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0}
                         }
-                    }
-                }
-            ));
-
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
-                {
+                    },
                     {
                         {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0}
                         },
                         {
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            },
-                            {
-                                {0.5, 0.3, 0.1},
-                                {0.4, 0.2, 0.6},
-                                {0.7, 0.8, 0.9}
-                            }
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0}
                         }
                     }
                 }
-            ));
+            }
+        ));
 
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
+        const Tensor expectedGrad0 = Array4D<aif32,2,2,3,3>(
+            {
                 {
                     {
-                        {58.0, 62.0, 66.0},
-                        {70.0, 74.0, 78.0},
-                        {82.0, 86.0, 90.0}
+                        {
+                            {0.5, 0.3, 0.1},
+                            {0.4, 0.2, 0.6},
+                            {0.7, 0.8, 0.9}
+                        },
+                        {
+                            {0.5, 0.3, 0.1},
+                            {0.4, 0.2, 0.6},
+                            {0.7, 0.8, 0.9}
+                        }
+                    },
+                    {
+                        {
+                            {0.5, 0.3, 0.1},
+                            {0.4, 0.2, 0.6},
+                            {0.7, 0.8, 0.9}
+                        },
+                        {
+                            {0.5, 0.3, 0.1},
+                            {0.4, 0.2, 0.6},
+                            {0.7, 0.8, 0.9}
+                        }
                     }
                 }
-            ));
+            }
+        );
 
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+        const Tensor expectedGrad1 = Array2D<aif32,3, 3>(
             {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
+                {
+                    {58.0, 62.0, 66.0},
+                    {70.0, 74.0, 78.0},
+                    {82.0, 86.0, 90.0}
+                }
             }
+        );
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
 
-            myMul->backward();
+        op->backward();
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
-        }
+        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
 
-        SECTION("Case 4: 3D and 2D tensors") {
-            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<aif32, 2, 3, 4>(
+            {
                 {
                     {
-                        {
-                            {1.0, 2.0, 3.0, 4.0},
-                            {5.0, 6.0, 7.0, 8.0},
-                            {9.0, 10.0, 11.0, 12.0},
-                        },
-                        {
-                            {13.0, 14.0, 15.0, 16.0},
-                            {17.0, 18.0, 19.0, 20.0},
-                            {21.0, 22.0, 23.0, 24.0},
-                        }
-                    }
-                }
-            ));
-
-            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
-                {
+                        {1.0, 2.0, 3.0, 4.0},
+                        {5.0, 6.0, 7.0, 8.0},
+                        {9.0, 10.0, 11.0, 12.0},
+                    },
                     {
-                        {0.1, 0.2, 0.3, 0.4},
-                        {0.5, 0.6, 0.7, 0.8},
-                        {0.9, 1.0, 1.1, 1.2}
+                        {13.0, 14.0, 15.0, 16.0},
+                        {17.0, 18.0, 19.0, 20.0},
+                        {21.0, 22.0, 23.0, 24.0},
                     }
                 }
-            ));
+            }
+        ));
 
-            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
+        const auto T1 = std::make_shared<Tensor>(Array2D<aif32, 3, 4>(
+            {
                 {
-                    {
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        },
-                        {
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0, 1.0},
-                        }
-                    }
+                    {0.1, 0.2, 0.3, 0.4},
+                    {0.5, 0.6, 0.7, 0.8},
+                    {0.9, 1.0, 1.1, 1.2}
                 }
-            ));
+            }
+        ));
 
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
+        const auto newGrad = std::make_shared<Tensor>(Array3D<aif32, 2,3,4>(
+            {
                 {
                     {
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        },
-                        {
-                            {0.1, 0.2, 0.3, 0.4},
-                            {0.5, 0.6, 0.7, 0.8},
-                            {0.9, 1.0, 1.1, 1.2}
-                        }
+                        {1.0, 1.0, 1.0, 1.0},
+                        {1.0, 1.0, 1.0, 1.0},
+                        {1.0, 1.0, 1.0, 1.0},
+                    },
+                    {
+                        {1.0, 1.0, 1.0, 1.0},
+                        {1.0, 1.0, 1.0, 1.0},
+                        {1.0, 1.0, 1.0, 1.0},
                     }
                 }
-            ));
+            }
+        ));
 
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
+        const Tensor expectedGrad0 = Array3D<aif32,2,3,4>(
+            {
                 {
                     {
-                        {14.0, 16.0, 18.0, 20.0},
-                        {22.0, 24.0, 26.0, 28.0},
-                        {30.0, 32.0, 34.0, 36.0}
+                        {0.1, 0.2, 0.3, 0.4},
+                        {0.5, 0.6, 0.7, 0.8},
+                        {0.9, 1.0, 1.1, 1.2}
+                    },
+                    {
+                        {0.1, 0.2, 0.3, 0.4},
+                        {0.5, 0.6, 0.7, 0.8},
+                        {0.9, 1.0, 1.1, 1.2}
                     }
                 }
-            ));
+            }
+        );
 
-            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+        const Tensor expectedGrad1 = Array2D<aif32,3,4>(
             {
-                T->setBackend("cpu") ;
-                T->setDataType(DataType::Float32);
+                {
+                    {14.0, 16.0, 18.0, 20.0},
+                    {22.0, 24.0, 26.0, 28.0},
+                    {30.0, 32.0, 34.0, 36.0}
+                }
             }
+        );
 
-            op->associateInput(0, T0);
-            op->associateInput(1, T1);
-            op->getOutput(0)->setGrad(newGrad);
-            op->forwardDims();
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
 
-            myMul->backward();
+        op->backward();
 
-            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
-        }
+        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
     }
+}
 
 TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
@@ -366,8 +353,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
-    std::shared_ptr<Node> myMul = Mul();
-    auto op = std::static_pointer_cast<OperatorTensor>(myMul-> getOperator());
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
@@ -441,7 +427,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
@@ -451,8 +437,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
 
 
@@ -568,7 +554,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
@@ -582,8 +568,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -663,7 +649,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
@@ -678,8 +664,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
     }
 }
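[Editor's note on the expected gradients above] The T0Grad/T1Grad values asserted in "Case 1: 2D and 1D tensors" follow from the chain rule for an elementwise multiply with broadcasting: for z = x * y with y broadcast along the first axis, dL/dx = dL/dz * y and dL/dy is dL/dz * x summed over the broadcast axis. A minimal standalone sketch (plain C++, no Aidge types; the variable names are illustrative only) that recomputes those reference values:

    #include <array>
    #include <cstddef>   // std::size_t
    #include <cstdio>

    int main() {
        // Inputs of "Case 1": x is the 2x3 tensor T0, y the length-3 tensor T1,
        // and the output gradient dL/dz is all ones (so it drops out below).
        const std::array<std::array<float, 3>, 2> x{{{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}}};
        const std::array<float, 3> y{0.1f, 0.2f, 0.3f};

        std::array<std::array<float, 3>, 2> gradX{};  // expected T0Grad
        std::array<float, 3> gradY{};                 // expected T1Grad
        for (std::size_t i = 0; i < x.size(); ++i) {
            for (std::size_t j = 0; j < y.size(); ++j) {
                gradX[i][j] = y[j];      // dL/dx = dL/dz * y, broadcast over rows
                gradY[j]   += x[i][j];   // dL/dy = sum over rows of dL/dz * x
            }
        }

        // gradX rows are {0.1, 0.2, 0.3} and gradY is {5, 7, 9},
        // matching the T0Grad / T1Grad tensors asserted in the test.
        std::printf("gradY = {%g, %g, %g}\n", gradY[0], gradY[1], gradY[2]);
        return 0;
    }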
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index cb5d8872..8238da39 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -9,18 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cmath>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Pow.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -118,8 +126,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -213,8 +221,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -309,8 +317,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
     }
 
@@ -440,7 +448,7 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                     }
                 }
             ));
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
                 {
                     {
                         {
@@ -453,18 +461,13 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                         }
                     }
                 }
-            ));
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+            );
+            const Tensor expectedGrad1 = Array1D<float, 3>(
                 {
                     {14.14779854, 22.99299049, 33.56402588}
                 }
-            ));
+            );
 
-            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
             std::shared_ptr<Node> powOp = Pow();
             auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
             opr->setDataType(DataType::Float32);
@@ -475,8 +478,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             powOp->forward();
 
             powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
         }
     }
 }
diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp
index b4cf9ffb..8b5dd53a 100644
--- a/unit_tests/operator/Test_RoundImpl.cpp
+++ b/unit_tests/operator/Test_RoundImpl.cpp
@@ -9,15 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-#include <iomanip>
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Round.hpp"
 #include "aidge/utils/TensorUtils.hpp"
@@ -29,7 +37,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(-15, 15); 
+    std::uniform_real_distribution<float> valueDist(-15, 15);
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
 
@@ -59,7 +67,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                
+
                 // generate 2 random Tensors
                 const std::size_t nbDims = nbDimsDist(gen);
                 std::vector<std::size_t> dims;
@@ -72,7 +80,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // without broadcasting
                 float* array0 = new float[nb_elements];
                 float* result = new float[nb_elements];
-                
+
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
                     result[i] = std::nearbyint(array0[i]);
@@ -86,29 +94,22 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
                 // results
                 Tres->resize(dims);
                 Tres -> getImpl() -> setRawPtr(result, nb_elements);
-                
+
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
                 myRound->forward();
                 end = std::chrono::system_clock::now();
                 duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
 
-                bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres);
-
-                auto Output = *(op->getOutput(0));
-                
-                auto prt = Output.getImpl()->rawPtr();
-
-                REQUIRE(is_eq);
-                
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
                 delete[] array0;
                 delete[] result;
 
 
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
     }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 44666ae6..471ae560 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -9,17 +9,26 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstddef>   // std::size_t
-#include <cstdint>   // std::uint16_t
-#include <chrono>
-#include <iostream>
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Sub.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
@@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
 
                 // with broadcasting
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            fmt::print("INFO: total time: {} μs\n", duration.count());
         }
     }
 }
-- 
GitLab


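[Editor's note on the logging change in the patch above] All of the updated test files converge on the same timing/reporting pattern: accumulate per-trial wall-clock time in a microsecond duration and report throughput with fmt::print instead of std::cout. A minimal sketch of that pattern, assuming the accumulator is a std::chrono::duration<double, std::micro> as the updated include comments suggest (the sleep_for call is only a placeholder workload, not part of the tests):

    #include <chrono>      // std::micro, std::chrono::duration, std::chrono::system_clock
    #include <cstddef>     // std::size_t
    #include <thread>      // std::this_thread::sleep_for (placeholder workload only)

    #include <fmt/core.h>

    int main() {
        std::chrono::duration<double, std::micro> duration{};
        std::size_t number_of_operation = 0;

        for (int trial = 0; trial < 3; ++trial) {
            const auto start = std::chrono::system_clock::now();
            std::this_thread::sleep_for(std::chrono::milliseconds(1));  // stand-in for op->forward()
            const auto end = std::chrono::system_clock::now();
            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
            number_of_operation += 1000;  // stand-in for the per-trial element count
        }

        fmt::print("INFO: number of elements over time spent: {}\n", number_of_operation / duration.count());
        fmt::print("INFO: total time: {} μs\n", duration.count());
        return 0;
    }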
From 4c80e1421cbfcea086e90ea20f3e4bc3c96f2d20 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Sun, 19 Jan 2025 15:55:52 +0000
Subject: [PATCH 15/30] Rename 'WeightInterleavingImpl' to 'WeightInterleavedImpl'

---
 include/aidge/backend/cpu.hpp                 |   2 +-
 ...vingImpl.hpp => WeightInterleavedImpl.hpp} |   4 +-
 ....hpp => WeightInterleavedImpl_kernels.hpp} | 110 ++++++++++--------
 ...vingImpl.cpp => WeightInterleavedImpl.cpp} |  18 +--
 .../operator/Test_WeightInterleavingImpl.cpp  |  74 ++++++------
 5 files changed, 110 insertions(+), 98 deletions(-)
 rename include/aidge/backend/cpu/operator/{WeightInterleavingImpl.hpp => WeightInterleavedImpl.hpp} (87%)
 rename include/aidge/backend/cpu/operator/{WeightInterleavingImpl_kernels.hpp => WeightInterleavedImpl_kernels.hpp} (57%)
 rename src/operator/{WeightInterleavingImpl.cpp => WeightInterleavedImpl.cpp} (84%)

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 98015d5b..539a3128 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -53,7 +53,7 @@
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
-#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
similarity index 87%
rename from include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp
rename to include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
index 0b3b1c57..ff5c4778 100644
--- a/include/aidge/backend/cpu/operator/WeightInterleavingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
@@ -23,7 +23,7 @@
 
 namespace Aidge {
 // Operator implementation entry point for the backend
-using WeightInterleavingImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
+using WeightInterleavedImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
@@ -31,7 +31,7 @@ using WeightInterleavingImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
         void *)>;
 
 // Implementation entry point registration to Operator
-REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavingImpl_cpu::create);
+REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavedImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_WeightInterleavingIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
similarity index 57%
rename from include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
rename to include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
index f2347fd2..18557f8f 100644
--- a/include/aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
@@ -1,23 +1,35 @@
-
-
-#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
-#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
-
-#include <algorithm>
-
-#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::int8_t, std::uint8_t
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
 
 
 namespace Aidge {
 
     /**
      * @brief Compacts 8-bit data into a smaller bit-width representation.
-     * 
-     * This function takes an array of 8-bit data and compacts it into smaller chunks 
-     * based on the specified bit-width `nb_bits`. Each element in `compactData` will 
+     *
+     * This function takes an array of 8-bit data and compacts it into smaller chunks
+     * based on the specified bit-width `nb_bits`. Each element in `compactData` will
      * store multiple packed `nb_bits` segments extracted from `data`.
-     * 
+     *
      * @param data The input array of 8-bit values to be compacted.
      * @param dataSize The size of the input `data` array.
      * @param compactData The output array storing the compacted data.
@@ -39,14 +51,14 @@ namespace Aidge {
         std::uint8_t shift = 8 / nbSlot;
 
         const unsigned int nbFullCompactbytes = dataSize / nbSlot;
-        
+
         // Main loop to process data in groups of `nbSlot`
         for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
             T compact = 0;
-            
+
             for (unsigned int j = 0; j < nbSlot; ++j) {
                 compact |= (data[i * nbSlot + j] & mask);    // Apply mask to keep `nb_bits` only
-                
+
                 // Shift only if not on the last slot to make room for the next `nb_bits`
                 if (j < nbSlot - 1) {
                     compact <<= shift;
@@ -55,7 +67,7 @@ namespace Aidge {
             // Store the compacted value in the output array
             compactData[i] = compact;
         }
-        
+
 
         // Handle any remaining data elements (if dataSize is not a multiple of nbSlot).
         std::size_t remaining = dataSize % nbSlot;
@@ -63,7 +75,7 @@ namespace Aidge {
             std::int8_t compact = 0;
             for (std::size_t j = 0; j < remaining; ++j) {
                 compact |= (data[nbFullCompactbytes*nbSlot + j] & mask);
-                
+
                 if (j < remaining - 1) {
                     compact <<= shift;
                 }
@@ -75,7 +87,7 @@ namespace Aidge {
     }
 
 template <class I, class O, int nb_bits>
-void WeightInterleavingImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
+void WeightInterleavedImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
                             const DimSize_t nb_interleaving,
                             const DimSize_t output_interleaving,
                             const void* input_,
@@ -91,41 +103,41 @@ void WeightInterleavingImpl_cpu_forward_kernel(const DimSize_t input_interleavin
 }
 
 
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int4>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int3>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Int2>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::Binary>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
-
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt4>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt3>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
-REGISTRAR(WeightInterleavingImpl_cpu,
-    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavingType<DataType::UInt2>::type, DataFormat::NHWC}},
-    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
-
-
-// REGISTRAR(WeightInterleavingImpl_cpu,
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Binary>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
+
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
+
+
+// REGISTRAR(WeightInterleavedImpl_cpu,
 //     {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
-//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
-// REGISTRAR(WeightInterleavingImpl_cpu,
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
 //     {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
-//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
-// REGISTRAR(WeightInterleavingImpl_cpu,
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
 //     {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
-//     {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
 
 
 }
 
-#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ */
\ No newline at end of file
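[Editor's note on the compaction helper above] For a concrete view of the packing the kernel performs, here is a standalone sketch of the 4-bit case (mask 0x0F, two values per compacted byte, first value ending up in the high nibble). It mirrors the masking/shifting logic shown in the kernel, is illustrative rather than the kernel itself, assumes dataSize is a multiple of the slot count, and reproduces the "CompactData - 4-bit compaction" expectation from the unit test further down: {0x0F, 0xF5, 0xB3, 0x9C} packs to {0xF5, 0x3C}.

    #include <cstddef>   // std::size_t
    #include <cstdint>   // std::int8_t, std::uint8_t
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<std::int8_t> data{
            static_cast<std::int8_t>(0x0F), static_cast<std::int8_t>(0xF5),
            static_cast<std::int8_t>(0xB3), static_cast<std::int8_t>(0x9C)};

        constexpr unsigned nbBits = 4;
        constexpr std::uint8_t mask = (1u << nbBits) - 1;  // 0x0F: keep nbBits per value
        constexpr unsigned nbSlot = 8 / nbBits;             // 2 values per compacted byte
        constexpr unsigned shift = 8 / nbSlot;               // room left for the next slot

        std::vector<std::int8_t> compactData;
        for (std::size_t i = 0; i < data.size(); i += nbSlot) {  // assumes size % nbSlot == 0
            std::uint8_t compact = 0;
            for (unsigned j = 0; j < nbSlot; ++j) {
                compact |= (data[i + j] & mask);  // keep only the low nbBits of each value
                if (j < nbSlot - 1) {
                    compact <<= shift;            // shift so the first value lands in the high nibble
                }
            }
            compactData.push_back(static_cast<std::int8_t>(compact));
        }

        // Prints "0xF5 0x3C", matching expectedWeightInterleaving in the test below.
        std::printf("0x%02X 0x%02X\n",
                    static_cast<unsigned>(compactData[0]) & 0xFFu,
                    static_cast<unsigned>(compactData[1]) & 0xFFu);
        return 0;
    }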
diff --git a/src/operator/WeightInterleavingImpl.cpp b/src/operator/WeightInterleavedImpl.cpp
similarity index 84%
rename from src/operator/WeightInterleavingImpl.cpp
rename to src/operator/WeightInterleavedImpl.cpp
index afb79179..2c9f3a6e 100644
--- a/src/operator/WeightInterleavingImpl.cpp
+++ b/src/operator/WeightInterleavedImpl.cpp
@@ -9,7 +9,7 @@
  *
  ********************************************************************************/
 
-#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
 #include <cstddef>  // std::size_t
 #include <functional>
@@ -17,19 +17,19 @@
 #include <tuple>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp"
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp"
 #include "aidge/operator/WeightInterleaving.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
 
 
 template <>
-void Aidge::WeightInterleavingImpl_cpu::forward()
+void Aidge::WeightInterleavedImpl_cpu::forward()
 {
     const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0");
 
-    const auto impl = Registrar<WeightInterleavingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+    const auto impl = Registrar<WeightInterleavedImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -38,14 +38,14 @@ void Aidge::WeightInterleavingImpl_cpu::forward()
     std::shared_ptr<Tensor> input0Fallback;
     const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
 
-    // inputInterleaving is the number of consecutive input elements that will be compacted 
+    // inputInterleaving is the number of consecutive input elements that will be compacted
     // Here the interleaving is the last dimension (cf STM32 low bit kernels)
     std::size_t inputInterleaving = input0.dims().back();
 
     // The resulting compacted dimension was computed in forwardDims and the output tensor was resized
     std::size_t outputInterleaving = op_.getOutput(0)->dims().back();
 
-    // nb_interleaving is the number of compacted segments 
+    // nb_interleaving is the number of compacted segments
     std::size_t nbInterleaving;
 
     // Determine the number of segment to compact
@@ -65,11 +65,11 @@ void Aidge::WeightInterleavingImpl_cpu::forward()
         outputInterleaving,
         input0.getImpl()->rawPtr(),
         getCPUPtr(mOp.getRawOutput(0)));
-    
-    
+
+
 }
 
 template <>
-void Aidge::WeightInterleavingImpl_cpu::backward() {
+void Aidge::WeightInterleavedImpl_cpu::backward() {
     AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu");
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
index 9bd9f146..c95c8fca 100644
--- a/unit_tests/operator/Test_WeightInterleavingImpl.cpp
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -23,7 +23,7 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
-    
+
     std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
     auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
 
@@ -64,9 +64,9 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
 
     SECTION("CompactData - 4-bit compaction") {
         std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
-                                                                {static_cast<std::int8_t>(0x0F), 
-                                                                static_cast<std::int8_t>(0xF5), 
-                                                                static_cast<std::int8_t>(0xB3), 
+                                                                {static_cast<std::int8_t>(0x0F),
+                                                                static_cast<std::int8_t>(0xF5),
+                                                                static_cast<std::int8_t>(0xB3),
                                                                 static_cast<std::int8_t>(0x9C)}
                                                                 });
 
@@ -74,17 +74,17 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         weight->setDataType(Aidge::DataType::Int4);
 
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
-                                                                {static_cast<int8_t>(0xF5), 
+                                                                {static_cast<int8_t>(0xF5),
                                                                 static_cast<int8_t>(0x3C)}
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -93,8 +93,8 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
 
     SECTION("CompactData - 3-bit compaction") {
         std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
-                                                                {static_cast<int8_t>(0x0F), 
-                                                                static_cast<int8_t>(0x05), 
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
                                                                 static_cast<int8_t>(0x04),
                                                                 static_cast<int8_t>(0xD3)}
                                                                 });
@@ -103,17 +103,17 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         weight->setDataType(Aidge::DataType::Int3);
 
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
-                                                                {static_cast<int8_t>(0x75), 
+                                                                {static_cast<int8_t>(0x75),
                                                                 static_cast<int8_t>(0x43)}
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -124,7 +124,7 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
                                                                 {static_cast<std::int8_t>(0x03),
                                                                  static_cast<std::int8_t>(0x02),
-                                                                 static_cast<std::int8_t>(0x01), 
+                                                                 static_cast<std::int8_t>(0x01),
                                                                  static_cast<std::int8_t>(0x00)}
                                                                  });
 
@@ -136,12 +136,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int2>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int2>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -161,12 +161,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -175,8 +175,8 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
 
     SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
         std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
-                                                                {static_cast<int8_t>(0x0F), 
-                                                                static_cast<int8_t>(0xA5), 
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0xA5),
                                                                 static_cast<int8_t>(0x34)}
                                                                 });
 
@@ -184,17 +184,17 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         weight->setDataType(Aidge::DataType::Int4);
 
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
-                                                                {static_cast<int8_t>(0xF5), 
+                                                                {static_cast<int8_t>(0xF5),
                                                                 static_cast<int8_t>(0x40)}
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -205,8 +205,8 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
     SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
 
         std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
-                                                                {static_cast<int8_t>(0x0F), 
-                                                                static_cast<int8_t>(0x05), 
+                                                                {static_cast<int8_t>(0x0F),
+                                                                static_cast<int8_t>(0x05),
                                                                 static_cast<int8_t>(0x04)}
                                                                 });
 
@@ -214,17 +214,17 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
         weight->setDataType(Aidge::DataType::Int3);
 
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
-                                                                {static_cast<int8_t>(0x75), 
+                                                                {static_cast<int8_t>(0x75),
                                                                 static_cast<int8_t>(0x40)}
                                                                 });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int3>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -271,9 +271,9 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                         {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
                     }
                 }
-            } 
+            }
         });
-        
+
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
             {
                 {
@@ -310,19 +310,19 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                         {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
                     }
                 }
-            } 
+            }
         });
 
         weight->setDataFormat(Aidge::DataFormat::NHWC);
         weight->setDataType(Aidge::DataType::Int4);
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
-        expectedWeightInterleaving->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
 
         std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
         auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
         op->associateInput(0,weight);
-        op->setDataType(WeightInterleavingType<Aidge::DataType::Int4>::type);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
         op->setDataFormat(DataFormat::NHWC);
         op->setBackend("cpu");
         myWeightInterleavingNode->forward();
@@ -368,9 +368,9 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                         {-1, -6, -3,  0}  // 'F' 'A' 'D' '0' in hexadecimal format
                     }
                 }
-            } 
+            }
         });
-        
+
         std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
             {
                 {
@@ -407,7 +407,7 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
                         {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)}  // 'F' 'A' 'D' '0' in hexadecimal format
                     }
                 }
-            } 
+            }
         });
 
         expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
@@ -415,12 +415,12 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
 
         // Create convolution node
         std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
-        
+
         // Place the weight tensor in the weight producer of the conv
         auto weightProducer = conv->getParent(1);
         weightProducer->getOperator()->setOutput(0, weight);
 
-        // Set dataType, dataformat and backend of convolution 
+        // Set dataType, dataformat and backend of convolution
         conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
         conv->getOperator()->setDataType(Aidge::DataType::Int4);
         conv->getOperator()->setBackend("cpu");
-- 
GitLab


From 33e4b6217e7ae668351dc1c3ad1ac391154ddc19 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Sun, 19 Jan 2025 15:57:41 +0000
Subject: [PATCH 16/30] UPD: change the logging of some tests to the new Log::info format

---
 unit_tests/operator/Test_BitShift.cpp            |  8 ++++----
 unit_tests/operator/Test_ClipImpl.cpp            | 16 ++++++++--------
 unit_tests/operator/Test_DivImpl.cpp             | 12 ++++++------
 .../operator/Test_GlobalAveragePoolingImpl.cpp   |  6 +++---
 unit_tests/operator/Test_MatMulImpl.cpp          | 12 ++++++------
 unit_tests/operator/Test_MulImpl.cpp             | 12 ++++++------
 unit_tests/operator/Test_PowImpl.cpp             | 12 ++++++------
 unit_tests/operator/Test_RoundImpl.cpp           |  4 ++--
 unit_tests/operator/Test_SubImpl.cpp             | 12 ++++++------
 9 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index db97e8d3..33ab932e 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -136,8 +136,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
 
 
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {}μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("Test BitShift kernels with Broadcasting") {
             std::size_t number_of_operation = 0;
@@ -236,8 +236,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {}μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
 }
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
index 1a7aa5e5..99147ac9 100644
--- a/unit_tests/operator/Test_ClipImpl.cpp
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -119,8 +119,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
-        fmt::print("INFO: total time: {}\n", duration.count());
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
     SECTION("Clip test with min >= max [Forward]") {
         std::size_t totalComputation = 0;
@@ -179,8 +179,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
-        fmt::print("INFO: total time: {}\n", duration.count());
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
     SECTION("Clip with Clip Attr [Forward]")
     {
@@ -232,8 +232,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
 
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
         }
-        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
-        fmt::print("INFO: total time: {}\n", duration.count());
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
     SECTION("Simple clip test [Backward]") {
         std::size_t totalComputation = 0;
@@ -311,8 +311,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
             duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
             REQUIRE(GT1 == BackwardTensorVec);
         }
-        fmt::print("INFO: multiplications over time spent: {}\n", totalComputation/duration.count());
-        fmt::print("INFO: total time: {}\n", duration.count());
+        Log::info("multiplications over time spent: {}\n", totalComputation/duration.count());
+        Log::info("total time: {}\n", duration.count());
     }
  }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index b03fe4aa..4037b2ad 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -126,8 +126,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
 
                 // with broadcasting
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -221,8 +221,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -317,8 +317,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 63f8d326..8e8536ac 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -554,9 +554,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
           delete[] result;
         }
       }
-      fmt::print("INFO: GlobalAveragePooling total execution time: {}µs\n", duration.count());
-      fmt::print("INFO: Number of operations : {}\n", number_of_operation);
-      fmt::print("INFO: Operation / µs = {}\n", number_of_operation / duration.count());
+      Log::info("GlobalAveragePooling total execution time: {}µs\n", duration.count());
+      Log::info("Number of operations : {}\n", number_of_operation);
+      Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
     }
   }
 }
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index daef47b3..f062f06c 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -111,8 +111,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
-        fmt::print("INFO: total time: {} μs\n", duration.count());
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("3-D Tensors") {
@@ -179,8 +179,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
-        fmt::print("INFO: total time: {} μs\n", duration.count());
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("4-D Tensors") {
@@ -249,8 +249,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
             delete[] bigArray2;
             delete[] res;
         }
-        fmt::print("INFO: number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
-        fmt::print("INFO: total time: {} μs\n", duration.count());
+        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
+        Log::info("total time: {} μs\n", duration.count());
     }
 
     SECTION("+2-D / 1-D") {
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 925b9f20..b5f51725 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -437,8 +437,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
 
@@ -568,8 +568,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -664,8 +664,8 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 }
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 8238da39..55a416c3 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -126,8 +126,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
 
                 // with broadcasting
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -221,8 +221,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -317,8 +317,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 
diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp
index 8b5dd53a..e658b061 100644
--- a/unit_tests/operator/Test_RoundImpl.cpp
+++ b/unit_tests/operator/Test_RoundImpl.cpp
@@ -108,8 +108,8 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
 
 
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {} μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
         }
     }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 471ae560..1317e88a 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -126,8 +126,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
 
                 // with broadcasting
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {}μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -221,8 +221,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {}μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -317,8 +317,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            fmt::print("INFO: number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            fmt::print("INFO: total time: {}μs\n", duration.count());
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
-- 
GitLab


From 1f6119e94cd58239a87a8babe50a3da07e79bb91 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 21 Jan 2025 10:50:28 +0000
Subject: [PATCH 17/30] ADD: fmt as private library

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4329d993..a2f50c50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,8 @@ file(GLOB_RECURSE inc_files "include/*.hpp")
 add_library(${module_name} ${src_files} ${inc_files})
 
 target_link_libraries(${module_name}
+    PRIVATE
+        fmt::fmt
     PUBLIC
         _aidge_core # _ is added because we link the exported target and not the project
 )
-- 
GitLab


From 06ca82aade14627b128ce209d8d414e746b364ea Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 21 Jan 2025 14:12:16 +0000
Subject: [PATCH 18/30] UPD: try testing with 'Log::log()' instead of
 'Tensor::print()' in Test_AddImpl.cpp
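
In practice the updated test formats the tensors through Log::info()
rather than calling Tensor::print(); the pattern used in the diff below
is, for example:

    Log::info("Add_1 Tensor:\n{}", *(op_1->getOutput(0)));

in place of op_1->getOutput(0)->print().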

---
 unit_tests/operator/Test_AddImpl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index 720c4ca2..bff9629b 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -100,7 +100,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Array4D<int,3,3,3,2> {
             {                                               //
                 {                                           //
                     {{ 120, 222},{ 124, 226},{ 128, 230}},  //
@@ -118,7 +118,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
                     {{ 144, 246},{ 148, 250},{152, 254}}    //
                 }                                           //
             }                                               //
-        });                                                 //
+        };                                                 //
 
         std::shared_ptr<Node> myAdd_0 = Add();
         std::shared_ptr<Node> myAdd_1 = Add();
@@ -135,8 +135,8 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         op_1->setBackend("cpu");
         myAdd_0->forward();
         myAdd_1->forward();
-        op_1->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op_1->getOutput(0) == *expectedOutput);
+        Log::info("Add_1 Tensor:\n{}", *(op_1->getOutput(0)));
+        Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
+        REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
-- 
GitLab


From 3618bb51cf652d843930768c8161bc954fc31ce9 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 20 Jan 2025 16:18:05 +0100
Subject: [PATCH 19/30] chore: Improve and test Mul Backward kernel

- Rework the Mul backward kernel to make it more straightforward and easily
adaptable to other element-wise kernels (sub, add, div); a minimal sketch of
the broadcasting-aware accumulation follows this list.
- Add tests, including a new test with random values
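
As a rough illustration of the broadcasting-aware accumulation (a
self-contained sketch, not the kernel itself; padDims/unflatten/flatten
below are hypothetical stand-ins for Aidge's getBroadcastedDims,
getMultiDimIndices and getFlattenedIndex, and the sample data is taken
from "Case 1" of the updated test):

    // Sketch of a broadcasting-aware element-wise multiply backward pass.
    #include <algorithm>  // std::copy
    #include <cstddef>    // std::size_t
    #include <cstdio>     // std::printf
    #include <vector>

    // Left-pad dims with 1s so their rank matches the output rank.
    static std::vector<std::size_t> padDims(const std::vector<std::size_t>& out,
                                            const std::vector<std::size_t>& in) {
        std::vector<std::size_t> padded(out.size(), 1);
        std::copy(in.begin(), in.end(), padded.end() - in.size());
        return padded;
    }

    // Flat output index -> multi-dimensional index for the given shape.
    static std::vector<std::size_t> unflatten(const std::vector<std::size_t>& dims,
                                              std::size_t flat) {
        std::vector<std::size_t> idx(dims.size());
        for (std::size_t d = dims.size(); d-- > 0;) {
            idx[d] = flat % dims[d];
            flat /= dims[d];
        }
        return idx;
    }

    // Multi-dimensional index -> flat index, clamping broadcast (size-1) dims to 0.
    static std::size_t flatten(const std::vector<std::size_t>& dims,
                               const std::vector<std::size_t>& idx) {
        std::size_t flat = 0;
        for (std::size_t d = 0; d < dims.size(); ++d) {
            flat = flat * dims[d] + (dims[d] == 1 ? 0 : idx[d]);
        }
        return flat;
    }

    int main() {
        // y = a * b with a: {2,3} and b: {3}, broadcast to an output of {2,3}.
        const std::vector<std::size_t> outDims{2, 3}, dimsA{2, 3}, dimsB{3};
        const std::vector<float> a{1, 2, 3, 4, 5, 6}, b{0.1f, 0.2f, 0.3f};
        const std::vector<float> gradOut(6, 1.0f);      // dL/dy = 1 everywhere
        std::vector<float> gradA(a.size(), 0.0f), gradB(b.size(), 0.0f);

        const auto bDimsA = padDims(outDims, dimsA);    // {2, 3}
        const auto bDimsB = padDims(outDims, dimsB);    // {1, 3}

        for (std::size_t i = 0; i < gradOut.size(); ++i) {
            const auto idxOut = unflatten(outDims, i);
            const auto iA = flatten(bDimsA, idxOut);    // element of a that contributed
            const auto iB = flatten(bDimsB, idxOut);    // element of b that contributed
            gradA[iA] += gradOut[i] * b[iB];            // dL/da = dL/dy * b
            gradB[iB] += gradOut[i] * a[iA];            // dL/db = dL/dy * a (accumulated)
        }

        std::printf("gradB = {%g, %g, %g}\n", gradB[0], gradB[1], gradB[2]);  // {5, 7, 9}
    }

The same index mapping carries over to the Sub, Add and Div backward
kernels; only the two accumulation lines change.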
---
 .../aidge/backend/cpu/operator/MulImpl.hpp    |   1 +
 .../backend/cpu/operator/MulImpl_kernels.hpp  |  90 +-
 src/operator/MulImpl.cpp                      |   1 +
 unit_tests/operator/Test_MulImpl.cpp          | 893 ++++++++++--------
 4 files changed, 532 insertions(+), 453 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index c927af9e..eec5583b 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -34,6 +34,7 @@ using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
         const std::size_t,
         const std::vector<std::size_t>,
         const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
         const void*,
         const void*,
         const void*,
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index e3d17a4b..36acb919 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -149,61 +149,53 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 template <class I1, class I2, class O>
 void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
-                                 const std::size_t input1Length,
-                                 const std::size_t grad0Length,
-                                 const std::vector<std::size_t> input0Dims,
-                                 const std::vector<std::size_t> input1Dims,
-                                 const void* input0_,
-                                 const void* input1_,
-                                 const void* grad_output_,
-                                 void* gradientInput0,
-                                 void* gradientInput1)
+                                  const std::size_t input1Length,
+                                  const std::size_t gradOutputLength,
+                                  const std::vector<std::size_t>& dims0,
+                                  const std::vector<std::size_t>& dims1,
+                                  const std::vector<std::size_t>& outputDims,
+                                  const void* input0_,
+                                  const void* input1_,
+                                  const void* grad_output_,
+                                  void* gradientInput0_,
+                                  void* gradientInput1_)
 {
-    const auto* input0 = static_cast<const I1*>(input0_);
-    const auto* input1 = static_cast<const I1*>(input1_);
-    const auto* grad_output = static_cast<const O*>(grad_output_);
-    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
-    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
-
-
-    if(input0Dims.size() >= input1Dims.size())
-    {
-        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
-
-        for(auto i = 0U; i < input0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
+    const I1* input0 = static_cast<const I1*>(input0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input0 indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            // If input0 is broadcast along this dimension (size 1), the index is 0.
+            // idxInput0 represents the multi-dimensional index of input0 contributing
+            // to the output at index i.
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input1Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
-
-            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
         }
 
-    } else {
-        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+        // We have to access tensors with a flat index, hence the conversion
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
 
-        for(auto i = 0U; i < input1Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
-
-            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
-        }
-
-        for(std::size_t i = 0 ; i < grad0Length; ++i)
-        {
-            const auto indices = getMultiDimIndices(input0Dims, i);
-            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
-
-            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
-        }
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
     }
 }
 
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index 422bdd00..a90d521a 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -58,6 +58,7 @@ void Aidge::MulImpl_cpu::backward() {
                /* grad0Length  */ out0grad->size(),
                /* input0Dims   */ in0->dims(),
                /* input1Dims   */ in1->dims(),
+               out0grad->dims(),
                getCPUPtr(in0),
                getCPUPtr(in1),
                getCPUPtr(out0grad),
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index b5f51725..7518bd18 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -9,365 +9,376 @@
  *
  ********************************************************************************/
 
-#include <chrono>      // std::micro, std::chrono::time_point,
-                       // std::chrono::system_clock,
-#include <cstddef>     // std::size_t
-#include <cstdint>     // std::uint16_t
-#include <functional>  // std::multiplies
-#include <memory>
-#include <numeric>     // std::accumulate
-#include <random>      // std::random_device, std::mt19937
-                       // std::uniform_int_distribution, std::uniform_real_distribution
-#include <vector>
-
 #include <catch2/catch_test_macros.hpp>
-#include <fmt/core.h>
+#include <chrono>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <iostream>
+#include <memory>
+#include <numeric> // std::accumulate
+#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
 
-#include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/backend/cpu/operator/MulImpl.hpp"
-#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Mul.hpp"
-#include "aidge/utils/ArrayHelpers.hpp"
-#include "aidge/utils/Log.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
-TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
-{
-    using aif32 = cpptype_t<DataType::Float32>;
-    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
+TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
+    std::shared_ptr<Node> myMul = Mul();
+    auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-    SECTION("Case 1: 2D and 1D tensors") {
-        const auto T0 = std::make_shared<Tensor>(Array2D<aif32,2,3>(
-            {
-                {
-                    {1,2,3},{4,5,6}
-                }
-            }
-        ));
+    // NOTE: The first four tests use fixed values; the last one uses random values but static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
 
-        const auto T1 = std::make_shared<Tensor>(Array1D<aif32,3>(
-            {0.1,0.2,0.3}
-        ));
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<aif32,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+        float *input0 = static_cast<float *>(T0->getImpl()->rawPtr());
+        float *input1 = static_cast<float *>(T1->getImpl()->rawPtr());
 
-        op->associateInput(0,T0);
-        op->associateInput(1,T1);
+        // TODO Use
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
-        op->forward();
-        op->backward();
+        myMul->backward();
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}}));
 
-        const Tensor T0Grad = Array2D<aif32, 2, 3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}});
-        const Tensor T1Grad = Array1D<aif32, 3>({5,7,9});
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({5, 7, 9}));
 
-        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), T0Grad));
-        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), T1Grad));
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
     }
 
     SECTION("Case 2: 3D and 1D tensors") {
-        const auto T0 = std::make_shared<Tensor>(Array3D<aif32,2,2,3>(
-            {
-                {
-                    {
-                        {1.0, 2.0, 3.0},
-                        {4.0, 5.0, 6.0}
-                    },
-                    {
-                        {7.0, 8.0, 9.0},
-                        {10.0, 11.0, 12.0}
-                    }
-                }
-            }
-        ));
-
-        const auto T1 = std::make_shared<Tensor>(Array1D<aif32, 3>({0.3,0.2,0.1}));
-
-        const auto newGrad = std::make_shared<Tensor>(Array3D<aif32,2,2,3>(
-                {
-                    {
-                        {
-                            {1, 1, 1},
-                            {1, 1, 1}
-                        },
-                        {
-                            {1, 1, 1},
-                            {1, 1, 1}
-                        }
-                    }
-                }
-            ));
-
-        const Tensor expectedGrad0 = Array3D<aif32,2,2,3>(
-            {
-                {
-                    {
-                        {0.3, 0.2, 0.1},
-                        {0.3, 0.2, 0.1}
-                    },
-                    {
-                        {0.3, 0.2, 0.1},
-                        {0.3, 0.2, 0.1}
-                    }
-                }
-            }
-        );
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}},
+                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}}));
 
-        const Tensor expectedGrad1 = Array1D<aif32,3>(
-            {22.0, 26.0, 30.0}
-        );
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({22.0, 26.0, 30.0}));
+
+        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        op->backward();
+        myMul->backward();
 
-        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
-        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
     }
 
     SECTION("Case 3: 4D and 2D tensors") {
-        const auto T0 = std::make_shared<Tensor>(Array4D<aif32,2, 2, 3, 3>(
-            {
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0},
-                            {7.0, 8.0, 9.0}
-                        },
-                        {
-                            {10.0, 11.0, 12.0},
-                            {13.0, 14.0, 15.0},
-                            {16.0, 17.0, 18.0}
-                        }
-                    },
-                    {
-                        {
-                            {19.0, 20.0, 21.0},
-                            {22.0, 23.0, 24.0},
-                            {25.0, 26.0, 27.0}
-                        },
-                        {
-                            {28.0, 29.0, 30.0},
-                            {31.0, 32.0, 33.0},
-                            {34.0, 35.0, 36.0}
-                        }
-                    }
-                }
-            }
-        ));
-
-        const auto T1 = std::make_shared<Tensor>(Array2D<aif32, 3,3>(
-            {
-                {
-                    {0.5,0.3,0.1},
-                    {0.4,0.2,0.6},
-                    {0.7,0.8,0.9}
-                }
-            }
-        ));
-
-        const auto newGrad = std::make_shared<Tensor>(Array4D<aif32,2, 2, 3, 3>(
-            {
-                {
-                    {
-                        {
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0}
-                        },
-                        {
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0}
-                        }
-                    },
-                    {
-                        {
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0}
-                        },
-                        {
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0},
-                            {1.0, 1.0, 1.0}
-                        }
-                    }
-                }
-            }
-        ));
-
-        const Tensor expectedGrad0 = Array4D<aif32,2,2,3,3>(
-            {
-                {
-                    {
-                        {
-                            {0.5, 0.3, 0.1},
-                            {0.4, 0.2, 0.6},
-                            {0.7, 0.8, 0.9}
-                        },
-                        {
-                            {0.5, 0.3, 0.1},
-                            {0.4, 0.2, 0.6},
-                            {0.7, 0.8, 0.9}
-                        }
-                    },
-                    {
-                        {
-                            {0.5, 0.3, 0.1},
-                            {0.4, 0.2, 0.6},
-                            {0.7, 0.8, 0.9}
-                        },
-                        {
-                            {0.5, 0.3, 0.1},
-                            {0.4, 0.2, 0.6},
-                            {0.7, 0.8, 0.9}
-                        }
-                    }
-                }
-            }
-        );
-
-        const Tensor expectedGrad1 = Array2D<aif32,3, 3>(
-            {
-                {
-                    {58.0, 62.0, 66.0},
-                    {70.0, 74.0, 78.0},
-                    {82.0, 86.0, 90.0}
-                }
-            }
-        );
+        const auto T0 = std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const auto expectedGrad0 =
+            std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+                {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}},
+                  {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}}));
+
+        const auto expectedGrad1 = std::make_shared<Tensor>(
+            Array2D<float, 3, 3>({{{58.0, 62.0, 66.0},
+                                   {70.0, 74.0, 78.0},
+                                   {82.0, 86.0, 90.0}}}));
+
+        for (const auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        op->backward();
+        myMul->backward();
 
-        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
-        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
     }
 
     SECTION("Case 4: 3D and 2D tensors") {
-        const auto T0 = std::make_shared<Tensor>(Array3D<aif32, 2, 3, 4>(
-            {
-                {
-                    {
-                        {1.0, 2.0, 3.0, 4.0},
-                        {5.0, 6.0, 7.0, 8.0},
-                        {9.0, 10.0, 11.0, 12.0},
-                    },
-                    {
-                        {13.0, 14.0, 15.0, 16.0},
-                        {17.0, 18.0, 19.0, 20.0},
-                        {21.0, 22.0, 23.0, 24.0},
-                    }
-                }
-            }
-        ));
-
-        const auto T1 = std::make_shared<Tensor>(Array2D<aif32, 3, 4>(
-            {
-                {
-                    {0.1, 0.2, 0.3, 0.4},
-                    {0.5, 0.6, 0.7, 0.8},
-                    {0.9, 1.0, 1.1, 1.2}
-                }
-            }
-        ));
-
-        const auto newGrad = std::make_shared<Tensor>(Array3D<aif32, 2,3,4>(
-            {
-                {
-                    {
-                        {1.0, 1.0, 1.0, 1.0},
-                        {1.0, 1.0, 1.0, 1.0},
-                        {1.0, 1.0, 1.0, 1.0},
-                    },
-                    {
-                        {1.0, 1.0, 1.0, 1.0},
-                        {1.0, 1.0, 1.0, 1.0},
-                        {1.0, 1.0, 1.0, 1.0},
-                    }
-                }
-            }
-        ));
-
-        const Tensor expectedGrad0 = Array3D<aif32,2,3,4>(
-            {
-                {
-                    {
-                        {0.1, 0.2, 0.3, 0.4},
-                        {0.5, 0.6, 0.7, 0.8},
-                        {0.9, 1.0, 1.1, 1.2}
-                    },
-                    {
-                        {0.1, 0.2, 0.3, 0.4},
-                        {0.5, 0.6, 0.7, 0.8},
-                        {0.9, 1.0, 1.1, 1.2}
-                    }
-                }
-            }
-        );
-
-        const Tensor expectedGrad1 = Array2D<aif32,3,4>(
-            {
-                {
-                    {14.0, 16.0, 18.0, 20.0},
-                    {22.0, 24.0, 26.0, 28.0},
-                    {30.0, 32.0, 34.0, 36.0}
-                }
-            }
-        );
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<float, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}},
+                                      {{0.1, 0.2, 0.3, 0.4},
+                                       {0.5, 0.6, 0.7, 0.8},
+                                       {0.9, 1.0, 1.1, 1.2}}}}));
+
+        const auto expectedGrad1 = std::make_shared<Tensor>(
+            Array2D<float, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
+                                   {22.0, 24.0, 26.0, 28.0},
+                                   {30.0, 32.0, 34.0, 36.0}}}));
+
+        for (const auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        op->backward();
+        myMul->backward();
 
-        REQUIRE(approxEq<aif32>(*(op->getInput(0)->grad()), expectedGrad0));
-        REQUIRE(approxEq<aif32>(*(op->getInput(1)->grad()), expectedGrad1));
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        const auto input0Size = 5 * 2 * 1 * 7;
+        const auto input1Size = 2 * 6 * 7;
+        const auto outputSize = 5 * 2 * 6 * 7;
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        std::vector<float> input0Data(input0Size);
+        std::vector<float> input1Data(input1Size);
+
+        // Fill with random values
+        for (auto &val : input0Data) {
+            val = dist(gen);
+        }
+        for (auto &val : input1Data) {
+            val = dist(gen);
+        }
+
+        auto T0 = std::make_shared<Tensor>();
+        auto T1 = std::make_shared<Tensor>();
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T0->resize(dims0);
+        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        T1->resize(dims1);
+        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        myMul->forward();
+
+        std::vector<float> expectedOutput(outputSize);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutput[outIdx] =
+                            input0Data[in0Idx] * input1Data[in1Idx];
+                    }
+                }
+            }
+        }
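+        // For reference, the flattened index mapping used above, with the
+        // output laid out as [n][c][h][w] = [5][2][6][7]:
+        //   outIdx = ((n * 2 + c) * 6 + h) * 7 + w
+        //   in0Idx = ((n * 2 + c) * 1 + 0) * 7 + w   (input0's h-axis is broadcast)
+        //   in1Idx = (c * 6 + h) * 7 + w             (input1 has no n-axis)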
+
+        auto outputTensor = op->getOutput(0);
+
+        // Verify forward pass
+        auto expectedOutputTensor = std::make_shared<Tensor>();
+        expectedOutputTensor->resize(outputDims);
+        expectedOutputTensor->setBackend("cpu");
+        expectedOutputTensor->setDataType(DataType::Float32);
+        expectedOutputTensor->getImpl()->setRawPtr(expectedOutput.data(),
+                                                     expectedOutput.size());
+
+        REQUIRE(approxEq<float>(*outputTensor, *expectedOutputTensor));
+
+        // Backward pass
+        std::vector<float> gradOutputData(outputSize);
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       outputSize);
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(input0Size, 0.0f);
+        std::vector<float> expectedGrad1(input1Size, 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output * input1
+                        expectedGrad0[in0Idx] +=
+                            gradOutputData[outIdx] * input1Data[in1Idx];
+
+                        // Gradient for input1: grad_output * input0
+                        expectedGrad1[in1Idx] +=
+                            gradOutputData[outIdx] * input0Data[in0Idx];
+                    }
+                }
+            }
+        }
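+        // The accumulation above is the reduction step of the chain rule:
+        // each broadcast input element sums gradOutput * otherInput over all
+        // output positions it maps to (over h for input0, over n for input1).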
+
+        // Perform backward pass
+        myMul->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>();
+        expectedGrad1Tensor->resize(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+
+        // Optional: print some values for verification
+        // std::cout << "Input shapes: (" << dims0[0] << "," << dims0[1] << ","
+        //           << dims0[2] << "," << dims0[3] << ") * (" << dims1[0]
+        //           << "," << dims1[1] << "," << dims1[2] << ") -> ("
+        //           << outputDims[0] << "," << outputDims[1] << ","
+        //           << outputDims[2] << "," << outputDims[3] << ")\n";
+        // std::cout << "Input sizes: " << input0Size << " * " << input1Size
+        //           << " -> " << outputSize << "\n";
     }
 }
 
-TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
+TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
     std::random_device rd;
     std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
-    std::uniform_int_distribution<int> boolDist(0,1);
-
-    // Create MatMul Operator
-    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
+    std::uniform_real_distribution<float> valueDist(
+        0.1f,
+        1.1f); // random float values in [0.1, 1.1)
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
+                                                           std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1),
+                                                          std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0, 1);
+
+    std::shared_ptr<Node> myMul = Mul();
+    auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-    // Create 2 input Tensors
     std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-    op->associateInput(0,T0);
+    op->associateInput(0, T0);
     T0->setDataType(DataType::Float32);
     T0->setBackend("cpu");
     std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
-    op -> associateInput(1,T1);
+    op->associateInput(1, T1);
     T1->setDataType(DataType::Float32);
     T1->setBackend("cpu");
 
-    // Create results Tensor
     std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
     Tres->setDataType(DataType::Float32);
     Tres->setBackend("cpu");
@@ -377,14 +388,9 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::chrono::time_point<std::chrono::system_clock> end;
     std::chrono::duration<double, std::micro> duration{};
 
-
     SECTION("MulImpl_cpu::forward()") {
-        SECTION("Scalar / Scalar") {
-
-        }
-        SECTION("Scalar / +1-D Tensor") {
-
-        }
+        SECTION("Scalar / Scalar") {}
+        SECTION("Scalar / +1-D Tensor") {}
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
 
             std::size_t number_of_operation = 0;
@@ -399,13 +405,17 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                     dims.push_back(dimSizeDist(gen));
                 }
 
-                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const auto nb_elements =
+                    std::accumulate(dims.cbegin(),
+                                    dims.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
-                float* array0 = new float[nb_elements];
-                float* array1 = new float[nb_elements];
-                float* result = new float[nb_elements];
+                float *array0 = new float[nb_elements];
+                float *array1 = new float[nb_elements];
+                float *result = new float[nb_elements];
 
                 for (std::size_t i = 0; i < nb_elements; ++i) {
                     array0[i] = valueDist(gen);
@@ -415,21 +425,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // input0
                 T0->resize(dims);
-                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+                T0->getImpl()->setRawPtr(array0, nb_elements);
 
                 // input1
                 T1->resize(dims);
-                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+                T1->getImpl()->setRawPtr(array1, nb_elements);
 
                 // results
                 Tres->resize(dims);
-                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+                Tres->getImpl()->setRawPtr(result, nb_elements);
 
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                op->forward();
+                myMul->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
 
@@ -437,24 +449,25 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
+            std::cout << "number of elements over time spent: "
+                      << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs"
+                      << std::endl;
         }
 
-
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
 
                 // generate 2 random Tensors
-                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                // handle dimensions, replace some dimensions with '1' to get
+                // broadcasting
 
                 constexpr std::size_t nbDims = 4;
                 std::vector<std::size_t> dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dimensions.push_back(dimSizeDist(gen));
                 }
 
@@ -462,77 +475,90 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 auto dims1 = dimensions;
                 auto dimsOut = dimensions;
 
-                for (std::size_t i = 0; i < nbDims; ++i)
-                {
-                    if (boolDist(gen))
-                    {
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
                         dims0[i] = 1;
                     }
 
-                    if (boolDist(gen))
-                    {
+                    if (boolDist(gen)) {
                         dims1[i] = 1;
                     }
 
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
-                for(auto dim : dims0)
-                {
+                for (auto dim : dims0) {
                     Log::info("Dimension of input 0 : {}", dim);
                 }
 
-                for(auto dim : dims1)
-                {
+                for (auto dim : dims1) {
                     Log::info("Dimension of input 1 : {}", dim);
                 }
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
-                {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                float *array1 =
+                    new float[dims1[0] * dims1[1] * dims1[2] * dims1[3]];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < dims0[0] * dims0[1] * dims0[2] * dims0[3];
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
 
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
-                {
+                for (std::size_t i = 0;
+                     i < dims1[0] * dims1[1] * dims1[2] * dims1[3];
+                     ++i) {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-
-                for (std::size_t a = 0; a < dimsOut[0]; ++a)
-                {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
-                    {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-
-                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
-
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
-                        {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
-                            {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
-                                                    + ((dims1[3] > 1) ? d : 0);
-
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1[1] * dims1[2] * dims1[3],
+                    dims1[2] * dims1[3],
+                    dims1[3],
+                    1};
+
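+                // Row-major strides per axis; on broadcast axes (size 1) the
+                // ternary guards below pin the coordinate to 0, so that axis
+                // contributes nothing to the flattened input index.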
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1[1] > 1) ? b : 0);
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] * ((dims1[2] > 1) ? c : 0) +
+                                    ((dims1[3] > 1) ? d : 0);
+
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1
+                                //           << ") -> " << array0[idx0] << " * "
+                                //           << array1[idx1] << " -> "
+                                //           << idx_out + d << std::endl;
                             }
                         }
                     }
@@ -541,22 +567,30 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+                T1->getImpl()->setRawPtr(
+                    array1,
+                    dims1[0] * dims1[1] * dims1[2] * dims1[3]);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                op->forward();
+                myMul->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -565,15 +599,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
+            std::cout << "number of elements over time spent: "
+                      << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs"
+                      << std::endl;
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
-            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(
+                std::size_t(1),
+                std::size_t(3));
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 // generate 2 random Tensors
@@ -590,15 +632,24 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                         dims1[i] = 1;
                     }
                 }
-                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+                dims1.erase(dims1.cbegin(),
+                            dims1.cbegin() + nbRemovedDimsDist(gen));
 
                 // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                float* array1 = new float[array1_size];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
+                float *array0 =
+                    new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
+                std::size_t array1_size =
+                    std::accumulate(dims1.cbegin(),
+                                    dims1.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
+                float *array1 = new float[array1_size];
+                float *result = new float[dimsOut[0] * dimsOut[1] *
+                                          dimsOut[2] * dimsOut[3]];
+
+                for (std::size_t i = 0;
+                     i < (dims0[0] * dims0[1] * dims0[2] * dims0[3]);
+                     ++i) {
                     array0[i] = valueDist(gen);
                 }
                 for (std::size_t i = 0; i < array1_size; ++i) {
@@ -607,27 +658,48 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
                 // compute true result
                 auto dims1_tmp = dims1;
-                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
-
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
+                dims1_tmp.insert(dims1_tmp.cbegin(),
+                                 4 - dims1_tmp.size(),
+                                 std::size_t(1));
+
+                const std::size_t strides0[nbDims] = {
+                    dims0[1] * dims0[2] * dims0[3],
+                    dims0[2] * dims0[3],
+                    dims0[3],
+                    1};
+                const std::size_t strides1[nbDims] = {
+                    dims1_tmp[1] * dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[2] * dims1_tmp[3],
+                    dims1_tmp[3],
+                    1};
                 for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                     for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
+                        const std::size_t idx0_0 =
+                            strides0[0] * ((dims0[0] > 1) ? a : 0) +
+                            strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 =
+                            strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) +
+                            strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
                         for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            const std::size_t idx_out =
+                                dimsOut[3] *
+                                (c + dimsOut[2] * (b + dimsOut[1] * a));
                             for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
-                                                    + ((dims1_tmp[3] > 1) ? d : 0);
-                                result[idx_out + d] = array0[idx0] * array1[idx1];
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                                std::size_t idx0 =
+                                    idx0_0 +
+                                    strides0[2] * ((dims0[2] > 1) ? c : 0) +
+                                    ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 =
+                                    idx1_0 +
+                                    strides1[2] *
+                                        ((dims1_tmp[2] > 1) ? c : 0) +
+                                    ((dims1_tmp[3] > 1) ? d : 0);
+                                result[idx_out + d] =
+                                    array0[idx0] * array1[idx1];
+                                // std::cout << "(" << idx0 << ", " << idx1
+                                //           << ") -> " << array0[idx0] << " * "
+                                //           << array1[idx1] << " -> "
+                                //           << idx_out + d << std::endl;
                             }
                         }
                     }
@@ -636,22 +708,28 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 // conversion to Aidge::Tensors
                 // input0
                 T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+                T0->getImpl()->setRawPtr(
+                    array0,
+                    dims0[0] * dims0[1] * dims0[2] * dims0[3]);
 
                 // input1
                 T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, array1_size);
+                T1->getImpl()->setRawPtr(array1, array1_size);
 
                 // results
                 Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+                Tres->getImpl()->setRawPtr(
+                    result,
+                    dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
 
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                op->forward();
+                myMul->forward();
                 end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+                duration +=
+                    std::chrono::duration_cast<std::chrono::microseconds>(
+                        end - start);
 
                 // comparison between truth and computed result
                 REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
@@ -660,13 +738,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
 
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements =
+                    std::accumulate(dimsOut.cbegin(),
+                                    dimsOut.cend(),
+                                    std::size_t(1),
+                                    std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
 
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
+            std::cout << "number of elements over time spent: "
+                      << (number_of_operation / duration.count()) << std::endl;
+            std::cout << "total time: " << duration.count() << "μs"
+                      << std::endl;
         }
     }
 }
 } // namespace Aidge
+
-- 
GitLab


From f8cfae3355e1a1b489b45fe3c742069dfc92c016 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Sat, 25 Jan 2025 01:49:54 +0000
Subject: [PATCH 20/30] Remove 'iostream' include from 'Test_MulImpl.cpp'

---
 unit_tests/operator/Test_MulImpl.cpp | 214 +++++++++++----------------
 1 file changed, 84 insertions(+), 130 deletions(-)

diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 7518bd18..2937e949 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -9,24 +9,29 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <chrono>
 #include <cstddef> // std::size_t
 #include <cstdint> // std::uint16_t
-#include <iostream>
 #include <memory>
 #include <numeric> // std::accumulate
-#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution,
+                   // std::uniform_int_distribution
+
+#include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Mul.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/Log.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
 TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
-    std::shared_ptr<Node> myMul = Mul();
-    auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
@@ -34,19 +39,10 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
     SECTION("Case 1: 1D and 2D Tensors") {
         const auto T0 = std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
 
         const auto T1 =
-            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
-
-        float *input0 = static_cast<float *>(T0->getImpl()->rawPtr());
-        float *input1 = static_cast<float *>(T1->getImpl()->rawPtr());
-
-        // TODO Use
-        T0->setDataType(DataType::Float32);
-        T0->setBackend("cpu");
-        T1->setDataType(DataType::Float32);
-        T1->setBackend("cpu");
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
@@ -54,16 +50,15 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
             Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
-        myMul->backward();
+        op->backward();
 
-        const auto expectedGrad0 = std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}}));
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}});
 
-        const auto expectedGrad1 =
-            std::make_shared<Tensor>(Array1D<float, 3>({5, 7, 9}));
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({5, 7, 9});
 
-        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
     }
 
     SECTION("Case 2: 3D and 1D tensors") {
@@ -77,31 +72,25 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
         const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
             {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
 
-        const auto expectedGrad0 = std::make_shared<Tensor>(
+        const Tensor expectedGrad0 =
             Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}},
-                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}}));
-
-        const auto expectedGrad1 =
-            std::make_shared<Tensor>(Array1D<float, 3>({22.0, 26.0, 30.0}));
+                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}});
 
-        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
-            T->setBackend("cpu");
-            T->setDataType(DataType::Float32);
-        }
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({22.0, 26.0, 30.0});
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        myMul->backward();
+        op->backward();
 
-        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
     }
 
     SECTION("Case 3: 4D and 2D tensors") {
-        const auto T0 = std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
             {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
                {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
               {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
@@ -109,42 +98,37 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
                 {31.0, 32.0, 33.0},
                 {34.0, 35.0, 36.0}}}}}));
 
-        const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 3>(
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
             {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
 
         const auto newGrad =
-            std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
                 {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
                    {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
                   {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
                    {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
 
-        const auto expectedGrad0 =
-            std::make_shared<Tensor>(Array4D<float, 2, 2, 3, 3>(
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
                 {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
                    {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}},
                   {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
-                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}}));
+                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}});
 
-        const auto expectedGrad1 = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>({{{58.0, 62.0, 66.0},
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{58.0, 62.0, 66.0},
                                    {70.0, 74.0, 78.0},
-                                   {82.0, 86.0, 90.0}}}));
-
-        for (const auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
-            T->setBackend("cpu");
-            T->setDataType(DataType::Float32);
-        }
+                                   {82.0, 86.0, 90.0}}});
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        myMul->backward();
+        op->backward();
 
-        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
     }
 
     SECTION("Case 4: 3D and 2D tensors") {
@@ -161,12 +145,12 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
                                       }}}));
 
         const auto T1 = std::make_shared<Tensor>(
-            Array2D<float, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
                                    {0.5, 0.6, 0.7, 0.8},
                                    {0.9, 1.0, 1.1, 1.2}}}));
 
         const auto newGrad = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 4>({{{
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
                                           {1.0, 1.0, 1.0, 1.0},
                                           {1.0, 1.0, 1.0, 1.0},
                                           {1.0, 1.0, 1.0, 1.0},
@@ -177,81 +161,68 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
                                           {1.0, 1.0, 1.0, 1.0},
                                       }}}));
 
-        const auto expectedGrad0 = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
                                        {0.5, 0.6, 0.7, 0.8},
                                        {0.9, 1.0, 1.1, 1.2}},
                                       {{0.1, 0.2, 0.3, 0.4},
                                        {0.5, 0.6, 0.7, 0.8},
-                                       {0.9, 1.0, 1.1, 1.2}}}}));
+                                       {0.9, 1.0, 1.1, 1.2}}}});
 
-        const auto expectedGrad1 = std::make_shared<Tensor>(
-            Array2D<float, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
                                    {22.0, 24.0, 26.0, 28.0},
-                                   {30.0, 32.0, 34.0, 36.0}}}));
-
-        for (const auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
-            T->setBackend("cpu");
-            T->setDataType(DataType::Float32);
-        }
+                                   {30.0, 32.0, 34.0, 36.0}}});
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
         op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
-        myMul->backward();
+        op->backward();
 
-        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
-        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
     }
 
     SECTION("Case 5: Tensors with random values") {
 
         // Use random values
-        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
-        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
-        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
-
-        const auto input0Size = 5 * 2 * 1 * 7;
-        const auto input1Size = 2 * 6 * 7;
-        const auto outputSize = 5 * 2 * 6 * 7;
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
 
         std::random_device rd;
         std::mt19937 gen(rd());
         std::uniform_real_distribution<float> dist(0.1f, 1.0f);
 
-        std::vector<float> input0Data(input0Size);
-        std::vector<float> input1Data(input1Size);
-
-        // Fill with random values
-        for (auto &val : input0Data) {
-            val = dist(gen);
-        }
-        for (auto &val : input1Data) {
-            val = dist(gen);
-        }
-
-        auto T0 = std::make_shared<Tensor>();
-        auto T1 = std::make_shared<Tensor>();
-
+        auto T0 = std::make_shared<Tensor>(dims0);
         T0->setDataType(DataType::Float32);
         T0->setBackend("cpu");
-        T0->resize(dims0);
-        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
 
+        auto T1 = std::make_shared<Tensor>(dims1);
         T1->setDataType(DataType::Float32);
         T1->setBackend("cpu");
-        T1->resize(dims1);
-        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
 
         op->forwardDims();
-        myMul->forward();
+        op->forward();
 
-        std::vector<float> expectedOutput(outputSize);
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
 
         for (std::size_t n = 0; n < 5; ++n) {
             for (std::size_t c = 0; c < 2; ++c) {
@@ -263,8 +234,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
                         std::size_t in1Idx =
                             w + 7 * (h + 6 * c);           // no n dimension
 
-                        expectedOutput[outIdx] =
-                            input0Data[in0Idx] * input1Data[in1Idx];
+                        expectedOutputData[outIdx] = input0Data[in0Idx] * input1Data[in1Idx];
                     }
                 }
             }
@@ -272,18 +242,10 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         auto outputTensor = op->getOutput(0);
 
-        // Verify forward pass
-        auto expectedOutputTensor = std::make_shared<Tensor>();
-        expectedOutputTensor->resize(outputDims);
-        expectedOutputTensor->setBackend("cpu");
-        expectedOutputTensor->setDataType(DataType::Float32);
-        expectedOutputTensor->getImpl()->setRawPtr(expectedOutput.data(),
-                                                     expectedOutput.size());
-
-        REQUIRE(approxEq<float>(*outputTensor, *expectedOutputTensor));
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
 
         // Backward pass
-        std::vector<float> gradOutputData(outputSize);
+        std::vector<float> gradOutputData(expectedOutput.size());
         for (auto &val : gradOutputData) {
             val = dist(gen);
         }
@@ -291,11 +253,11 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
         op->getOutput(0)->setGrad(std::make_shared<Tensor>());
         op->getOutput(0)->grad()->resize(outputDims);
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
-                                                       outputSize);
+                                                       expectedOutput.size());
 
         // Compute reference gradients
-        std::vector<float> expectedGrad0(input0Size, 0.0f);
-        std::vector<float> expectedGrad1(input1Size, 0.0f);
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
 
         for (std::size_t n = 0; n < 5; ++n) {
             for (std::size_t c = 0; c < 2; ++c) {
@@ -318,7 +280,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
         }
 
         // Perform backward pass
-        myMul->backward();
+        op->backward();
 
         auto expectedGrad0Tensor = std::make_shared<Tensor>();
         expectedGrad0Tensor->resize(T0->dims());
@@ -327,8 +289,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
         expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
                                                     expectedGrad0.size());
 
-        auto expectedGrad1Tensor = std::make_shared<Tensor>();
-        expectedGrad1Tensor->resize(T1->dims());
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
         expectedGrad1Tensor->setBackend("cpu");
         expectedGrad1Tensor->setDataType(DataType::Float32);
         expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
@@ -365,8 +326,7 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                                                           std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0, 1);
 
-    std::shared_ptr<Node> myMul = Mul();
-    auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
+    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
@@ -437,7 +397,7 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
 
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration +=
                     std::chrono::duration_cast<std::chrono::microseconds>(
@@ -449,10 +409,8 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                 delete[] array1;
                 delete[] result;
             }
-            std::cout << "number of elements over time spent: "
-                      << (number_of_operation / duration.count()) << std::endl;
-            std::cout << "total time: " << duration.count() << "μs"
-                      << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
 
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
@@ -586,7 +544,7 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration +=
                     std::chrono::duration_cast<std::chrono::microseconds>(
@@ -606,10 +564,8 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                                     std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
             }
-            std::cout << "number of elements over time spent: "
-                      << (number_of_operation / duration.count()) << std::endl;
-            std::cout << "total time: " << duration.count() << "μs"
-                      << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
         SECTION("+1-D Tensor / 1-D Tensor") {
             std::size_t number_of_operation = 0;
@@ -725,7 +681,7 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                 // compute result
                 op->forwardDims();
                 start = std::chrono::system_clock::now();
-                myMul->forward();
+                op->forward();
                 end = std::chrono::system_clock::now();
                 duration +=
                     std::chrono::duration_cast<std::chrono::microseconds>(
@@ -746,10 +702,8 @@ TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
                 number_of_operation += nb_elements;
             }
 
-            std::cout << "number of elements over time spent: "
-                      << (number_of_operation / duration.count()) << std::endl;
-            std::cout << "total time: " << duration.count() << "μs"
-                      << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
         }
     }
 }
-- 
GitLab


From e06442459603d6c15b6163c378fb3de68aa9504e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Wed, 11 Dec 2024 16:16:30 +0100
Subject: [PATCH 21/30] feat: add Expand operator

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../aidge/backend/cpu/operator/ExpandImpl.hpp |  35 +++
 .../cpu/operator/ExpandImpl_kernels.hpp       | 215 ++++++++++++++++++
 src/operator/ExpandImpl.cpp                   |  56 +++++
 unit_tests/operator/Test_ExpandImpl.cpp       | 113 +++++++++
 5 files changed, 420 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/ExpandImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp
 create mode 100644 src/operator/ExpandImpl.cpp
 create mode 100644 unit_tests/operator/Test_ExpandImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 539a3128..0c8ab84d 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -30,6 +30,7 @@
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/FoldImpl.hpp"
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/ExpandImpl.hpp b/include/aidge/backend/cpu/operator/ExpandImpl.hpp
new file mode 100644
index 00000000..adfc6ab1
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ExpandImpl.hpp
@@ -0,0 +1,35 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
+#define AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Expand.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ExpandImpl_cpu = OperatorImpl_cpu<Expand_Op,
+                                        void(const std::shared_ptr<Tensor> &,
+                                             const std::shared_ptr<Tensor> &,
+                                             void *,
+                                             const std::vector<DimSize_t> &)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Expand_Op, "cpu", Aidge::ExpandImpl_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp
new file mode 100644
index 00000000..3f4341c3
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ExpandImpl_kernels.hpp
@@ -0,0 +1,215 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+#include <aidge/data/Data.hpp>
+#include <aidge/data/Tensor.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/scheduler/ProdConso.hpp>
+#include <aidge/utils/Types.h>
+#include <cmath>
+#include <cstdint> // std::int32_t, std::int64_t
+#include <memory>
+#include <numeric>
+
+namespace {
+// assumes values are contiguous in memory
+template <class IO>
+void expandContiguousArray(const std::size_t inputStackSize,
+                           const std::size_t outputStackSize,
+                           const IO *input,
+                           IO *output) {
+    for (std::size_t i = 0; i < outputStackSize; ++i) {
+        output[i] = (inputStackSize == 1) ? input[0] : input[i];
+    }
+    return;
+}
+} // namespace
+
+namespace Aidge {
+
+template <class IO>
+void ExpandImpl_cpu_forward_kernel(
+    const std::shared_ptr<Tensor> &inData,
+    const std::shared_ptr<Tensor> &_inExpandShape,
+    void *_output,
+    const std::vector<DimSize_t> &outputDims) {
+
+    // retrieve the expand-shape values and the dimensions of the input data,
+    // as the process below needs to modify them
+    IO *output = static_cast<IO *>(_output);
+    std::vector<DimSize_t> inExpandShape(_inExpandShape->size());
+    for (DimSize_t i = 0; i < _inExpandShape->size(); ++i) {
+        inExpandShape[i] = _inExpandShape->get<std::int64_t>(i);
+    }
+    std::vector<DimSize_t> inDataDims = inData->dims();
+
+    // Example with 2 tensors
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions but adding 1s to the left of "smallest"
+    // tensor -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then ->
+    //    4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
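+    //
+    // Worked example for the shapes above ([5,2,1,7] data, [2,6,7] expand
+    // shape): padded shapes are [5,2,1,7] and [1,2,6,7], the output is
+    // [5,2,6,7], contiguousIdx ends up at 3, so each stack copies a
+    // contiguous block of 7 values and nbStacks = 5*2*6 = 60.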
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire
+    // arrays at once
+
+    if (inDataDims == inExpandShape) {
+        const std::size_t input0ContiguousSize =
+            std::accumulate(inDataDims.cbegin(),
+                            inDataDims.cend(),
+                            static_cast<std::size_t>(1),
+                            std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0ContiguousSize; ++i) {
+            output[i] = inData->get<IO>(i);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with
+    // ones.
+    if (inDataDims.size() > inExpandShape.size()) {
+        inExpandShape.insert(inExpandShape.cbegin(),
+                             inDataDims.size() - inExpandShape.size(),
+                             static_cast<DimSize_t>(1));
+    } else if (_inExpandShape->size() > inDataDims.size()) {
+        inDataDims.insert(inDataDims.cbegin(),
+                          inExpandShape.size() - inDataDims.size(),
+                          static_cast<DimSize_t>(1));
+    }
+
+    const std::size_t nbDims = inDataDims.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (inDataDims[contiguousIdx] != inExpandShape[contiguousIdx]) {
+            break;
+        }
+    }
+    if (contiguousIdx == (nbDims - 1)) {
+        // the trailing dimensions of one of the two shapes are of size 1
+        const std::vector<std::size_t> &dims =
+            (inDataDims[contiguousIdx] == 1) ? inDataDims : inExpandShape;
+        while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
+            --contiguousIdx;
+        }
+    }
+    ++contiguousIdx;
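+    // Dimensions in [contiguousIdx, nbDims) form the contiguous tail handled
+    // as one block per call to the copy kernel; dimensions in
+    // [0, contiguousIdx) are iterated stack by stack below.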
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t inputDataContiguousSize =
+        std::accumulate(inDataDims.cbegin() + contiguousIdx,
+                        inDataDims.cend(),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+    const std::size_t outputContiguousSize =
+        std::accumulate(outputDims.cbegin() + contiguousIdx,
+                        outputDims.cend(),
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stridePostIn =
+        std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> strideStepIn =
+        std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stridePostIn[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2;
+             i != static_cast<std::size_t>(-1);
+             --i) {
+            stridePostIn[i] = stridePostIn[i + 1] *
+                              static_cast<std::int32_t>(inDataDims[i + 1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            strideStepIn[i] = (inDataDims[i] == 1) ? 1 - stridePostIn[i] : 1;
+        }
+    }
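+    // strideStepIn[i] is the input-offset increment applied when dimension i
+    // is the next one to advance: +1 for a regular dimension,
+    // 1 - stridePostIn[i] (a rewind) for a broadcast dimension of size 1, so
+    // the corresponding input data is read again for the next repetition.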
+
+    // running offsets (in contiguous blocks) into the input and output arrays
+    std::size_t offsetInData = 0;
+    std::size_t offsetOut = 0;
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks =
+        std::accumulate(outputDims.cbegin(),
+                        outputDims.cbegin() + contiguousIdx,
+                        static_cast<std::size_t>(1),
+                        std::multiplies<std::size_t>());
+
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        expandContiguousArray<IO>(
+            inputDataContiguousSize,
+            outputContiguousSize,
+            &static_cast<const IO *>(
+                inData->getImpl()
+                    ->rawPtr())[offsetInData * inputDataContiguousSize],
+            &output[offsetOut * outputContiguousSize]);
+        if (++stack < nbStacks) {
+            std::size_t tmpStack = stack;
+            while (tmpStack % outputDims[dim] == 0) {
+                tmpStack /= outputDims[dim];
+                dim--;
+            }
+            offsetInData += strideStepIn[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
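+// Register the forward kernels for each supported data type.
+// Spec format: {{data input type, shape input type}, {output type}}; the
+// shape input is always expected as Int64.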
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int16, DataType::Int64}, {DataType::Int16}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int16_t>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int32, DataType::Int64}, {DataType::Int32}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int32_t>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Int64, DataType::Int64}, {DataType::Int64}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<std::int64_t>,
+           nullptr});
+
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float16, DataType::Int64}, {DataType::Float16}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<half_float::half>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float32, DataType::Int64}, {DataType::Float32}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<float>,
+           nullptr});
+REGISTRAR(ExpandImpl_cpu,
+          {{DataType::Float64, DataType::Int64}, {DataType::Float64}},
+          {ProdConso::inPlaceModel,
+           Aidge::ExpandImpl_cpu_forward_kernel<double>,
+           nullptr});
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_ */
diff --git a/src/operator/ExpandImpl.cpp b/src/operator/ExpandImpl.cpp
new file mode 100644
index 00000000..dfd4d2d8
--- /dev/null
+++ b/src/operator/ExpandImpl.cpp
@@ -0,0 +1,56 @@
+
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ExpandImpl_kernels.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Expand.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <> void ExpandImpl_cpu::forward() {
+    const Expand_Op &op_ = static_cast<const Expand_Op &>(mOp);
+    // Check that the inputs are provided
+    AIDGE_ASSERT(op_.getInput(0),
+                 "{}: missing input 0: {}",
+                 Expand_Op::Type,
+                 Expand_Op::getInputsName()[0]);
+    AIDGE_ASSERT(op_.getInput(1),
+                 "{}: missing input 1: {}",
+                 Expand_Op::Type,
+                 Expand_Op::getInputsName()[1]);
+
+    // Find the correct kernel type
+    const auto impl =
+        Registrar<ExpandImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.getInput(0),
+                 op_.getInput(1),
+                 op_.getOutput(0)->getImpl()->rawPtr(),
+                 op_.getOutput(0)->dims());
+}
+
+template <> void ExpandImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(
+        std::runtime_error,
+        "Backward not yet implemented for Expand_Op on backend cpu");
+}
+
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_ExpandImpl.cpp b/unit_tests/operator/Test_ExpandImpl.cpp
new file mode 100644
index 00000000..3fcb5e44
--- /dev/null
+++ b/unit_tests/operator/Test_ExpandImpl.cpp
@@ -0,0 +1,113 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <aidge/data/Data.hpp>
+#include <aidge/operator/OperatorTensor.hpp>
+#include <aidge/utils/ArrayHelpers.hpp>
+#include <aidge/utils/TensorUtils.hpp>
+#include <aidge/utils/Types.h>
+#include <catch2/catch_test_macros.hpp>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
+#include "aidge/operator/Expand.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using std::shared_ptr;
+
+using namespace Aidge;
+
+void setupTestExpand(shared_ptr<Tensor> inputData,
+                     shared_ptr<Tensor> inputShape,
+                     shared_ptr<OperatorTensor> &op,
+                     shared_ptr<Tensor> &expectedOutput) {
+
+    op->getOutput(0)->setDataType(inputData->dataType());
+
+    inputData->setBackend("cpu");
+    op->associateInput(0, inputData);
+
+    inputShape->setBackend("cpu");
+    op->associateInput(1, inputShape);
+
+    expectedOutput->setBackend("cpu");
+    expectedOutput->setDataType(DataType::Int32);
+}
+
+TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
+    auto node = Expand();
+    auto op = std::static_pointer_cast<OperatorTensor>(node->getOperator());
+    op->setBackend("cpu");
+
+    SECTION("Expand shape is bigger than inputData") {
+        auto inputData = std::make_shared<Tensor>(Array1D<int, 2>({1, 3}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 3, 4, 2}));
+        auto expectedOutput = std::make_shared<Tensor>(
+            Array4D<int, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
+                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}},
+                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}}));
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+    }
+    SECTION("Expand shape has less dimensions than inputData") {
+        auto inputData = std::make_shared<Tensor>(
+            Array3D<int, 2, 1, 3>({{{2, 1, 3}, {2, 1, 3}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
+        auto expectedOutput = std::make_shared<Tensor>(Array3D<int, 2, 2, 3>(
+            {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}}));
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+    }
+    SECTION("Expand shape = {1} leads to input equal to output.") {
+        auto inputData = std::make_shared<Tensor>(
+            Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1}));
+        auto expectedOutput = std::make_shared<Tensor>(
+            Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+    }
+    SECTION("The only common dimension is the last one & its equal to 1") {
+        auto inputData = std::make_shared<Tensor>(
+            Array4D<int, 1, 1, 3, 1>({{{{2, 1, 3}}}}));
+        auto inputShape =
+            std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1}));
+        auto expectedOutput = std::make_shared<Tensor>(
+            Array4D<int, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}}));
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
+
+        // forwardDims has already been tested in core
+        CHECK(op->forwardDims(true));
+        REQUIRE_NOTHROW(op->forward());
+        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+    }
+    SECTION("N-Dim to N-Dim") {}
+    auto inputData = std::shared_ptr<Tensor>();
+}
-- 
GitLab


From 3a1702a144dabd840d044971d5afef9c7383ad23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Mon, 27 Jan 2025 14:00:55 +0100
Subject: [PATCH 22/30] chore: clean up headers in tests to avoid Windows
 compiler error: "Fatal Error C1128"

---
 unit_tests/scheduler/Test_Scheduler.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 9224d6f9..956169c3 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -25,7 +25,12 @@
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
 
-#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
+
 #include "aidge/recipes/GraphViewHelper.hpp"
 
 
-- 
GitLab


From 755009ee85020a14c450ba92067ca1b9c3e61592 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 28 Jan 2025 15:42:04 +0000
Subject: [PATCH 23/30] Enforce C++14 in 'CMakeLists.txt'

---
 CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2f50c50..66ef8ff2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,8 @@
 cmake_minimum_required(VERSION 3.18)
-set(CXX_STANDARD 14)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
 
-- 
GitLab


From bbb0b049b1771df71c147a168202189ad16b1a51 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 28 Jan 2025 15:42:57 +0000
Subject: [PATCH 24/30] Change Python minimum version 3.7 -> 3.8

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index baa61de5..39bed4d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ description="CPU implementation of operators of the AIDGE framework"
 dependencies = [
     "numpy",
 ]
-requires-python = ">= 3.7"
+requires-python = ">= 3.8"
 readme = "README.md"
 license = { file = "LICENSE" }
 classifiers = [
-- 
GitLab


From 3c2b7dcde7b8d5edd00d2db840800d526d2d244b Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 28 Jan 2025 15:43:52 +0000
Subject: [PATCH 25/30] UPD: 'setup.py' to access compilation options from
 environment variables set by 'setup.sh'

---
 setup.py | 53 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/setup.py b/setup.py
index 82edd09c..366a4825 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ class AidgePkgBuild(build_ext):
         # This lists the number of processors available on the machine
         # The compilation will use half of them
         max_jobs = str(ceil(multiprocessing.cpu_count() / 2))
+        max_jobs = os.environ.get("AIDGE_NB_PROC", max_jobs)
 
         cwd = pathlib.Path().absolute()
 
@@ -53,14 +54,19 @@ class AidgePkgBuild(build_ext):
         package_prefix = build_lib if not self.editable_mode else SETUP_DIR
         pybind_install_prefix = (package_prefix / PROJECT_NAME).absolute()
 
-        os.chdir(str(build_temp))
-
-        compile_type = os.environ.get("AIDGE_PYTHON_BUILD_TYPE", "Release")
         install_path = (
             os.path.join(sys.prefix, "lib", "libAidge")
             if "AIDGE_INSTALL" not in os.environ
             else os.environ["AIDGE_INSTALL"]
         )
+
+        # Read environment variables for CMake options
+        c_compiler = os.environ.get("AIDGE_C_COMPILER", "gcc")
+        cxx_compiler = os.environ.get("AIDGE_CXX_COMPILER", "g++")
+        build_type = os.environ.get("AIDGE_BUILD_TYPE", "Release")
+        asan = os.environ.get("AIDGE_ASAN", "OFF")
+        cmake_arch = os.environ.get("AIDGE_CMAKE_ARCH", "")
+
         build_gen = os.environ.get("AIDGE_BUILD_GEN", "")
         build_gen_opts = (
             ["-G", build_gen]
@@ -69,26 +75,35 @@ class AidgePkgBuild(build_ext):
         )
         test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
 
-        self.spawn(
-            [
-                "cmake",
-                *build_gen_opts,
-                str(cwd),
-                f"-DTEST={test_onoff}",
-                f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
-                f"-DCMAKE_BUILD_TYPE={compile_type}",
-                "-DPYBIND=ON",
-                f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
-                "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
-                "-DCOVERAGE=OFF",
-            ]
-        )
+        os.chdir(str(build_temp))
+
+        cmake_cmd = [
+            "cmake",
+            *build_gen_opts,
+            str(cwd),
+            f"-DTEST={test_onoff}",
+            f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
+            f"-DCMAKE_BUILD_TYPE={build_type}",
+            f"-DCMAKE_C_COMPILER={c_compiler}",
+            f"-DCMAKE_CXX_COMPILER={cxx_compiler}",
+            f"-DENABLE_ASAN={asan}",
+            "-DPYBIND=ON",
+            f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
+            "-DCMAKE_EXPORT_COMPILE_COMMANDS=1",
+            "-DCOVERAGE=OFF",
+        ]
+
+        # Append architecture-specific arguments if provided
+        if cmake_arch:
+            cmake_cmd.append(cmake_arch)
+
+        self.spawn(cmake_cmd)
 
         if not self.dry_run:
             self.spawn(
-                ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs]
+                ["cmake", "--build", ".", "--config", build_type, "-j", max_jobs]
             )
-            self.spawn(["cmake", "--install", ".", "--config", compile_type])
+            self.spawn(["cmake", "--install", ".", "--config", build_type])
         os.chdir(str(cwd))
 
 
-- 
GitLab


From fe9e4ffae43c16a48eda8c2c3cf3aa6179383553 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 29 Jan 2025 23:44:43 +0000
Subject: [PATCH 26/30] FEAT: add minimum required Catch2 version in
 'unit_tests/CMakeLists.txt'

---
 unit_tests/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 7e63fb2b..6c7af9c3 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -1,4 +1,6 @@
-find_package(Catch2 QUIET)
+set(CATCH2_MIN_VERSION 3.3.0)
+
+find_package(Catch2 ${CATCH2_MIN_VERSION} QUIET)
 
 if(NOT Catch2_FOUND)
     message(STATUS "Catch2 not found in system, retrieving from git")
-- 
GitLab


From 8de69102f83bd9cf3a2ed1906cb17463a0c1b354 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 30 Jan 2025 15:17:45 +0000
Subject: [PATCH 27/30] UPD: only include what is used in Test files

---
 unit_tests/operator/Test_ErfImpl.cpp        | 40 ++++++--------
 unit_tests/operator/Test_ExpandImpl.cpp     | 60 +++++++++------------
 unit_tests/operator/Test_FCImpl.cpp         | 25 ++++-----
 unit_tests/operator/Test_FoldImpl.cpp       |  3 +-
 unit_tests/operator/Test_LeakyReLUImpl.cpp  |  7 ++-
 unit_tests/operator/Test_MaxPoolingImpl.cpp | 27 +++++-----
 unit_tests/operator/Test_Memorize.cpp       | 15 +++---
 unit_tests/operator/Test_PadImpl.cpp        | 10 ++--
 unit_tests/operator/Test_PaddedConv.cpp     | 12 ++---
 unit_tests/operator/Test_ReLUImpl.cpp       | 53 +++++++++---------
 unit_tests/operator/Test_ReduceMeanImpl.cpp | 19 ++++---
 unit_tests/operator/Test_SoftmaxImpl.cpp    | 41 ++++++--------
 unit_tests/operator/Test_SqrtImpl.cpp       | 53 ++++++++----------
 13 files changed, 170 insertions(+), 195 deletions(-)

diff --git a/unit_tests/operator/Test_ErfImpl.cpp b/unit_tests/operator/Test_ErfImpl.cpp
index 2826b5b5..c2fdd1c8 100644
--- a/unit_tests/operator/Test_ErfImpl.cpp
+++ b/unit_tests/operator/Test_ErfImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/ErfImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Erf.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 
 using namespace Aidge;
@@ -27,23 +29,18 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
             {0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199, 0.09514862,
               0.16145366, 0.42776686, 0.43487436, 0.41170865}
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<float,10> {
+        Tensor expectedOutput = Array1D<float,10> {
                 {0.44163144, 0.45801866, 0.81516320, 0.33941913, 0.72722000, 0.10704061,
               0.18061027, 0.45479023, 0.46144873, 0.43959764}
-        });
+        };
 
-        std::shared_ptr<Node> myErf = Erf();
-        auto op = std::static_pointer_cast<OperatorTensor>(myErf -> getOperator());
+        auto op = std::make_shared<Erf_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        myErf->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 
     SECTION("3D Tensor") {
@@ -59,7 +56,7 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<float,2,2,3> {
+        Tensor expectedOutput = Array3D<float,2,2,3> {
             {
                 {
                     {0.83003384, 0.77721894, 0.72857803},
@@ -70,19 +67,14 @@ TEST_CASE("[cpu/operator] Erf(forward)") {
                     {0.81564975, 0.83322692, 0.37109339}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myErf = Erf();
-        auto op = std::static_pointer_cast<OperatorTensor>(myErf -> getOperator());
+        auto op = std::make_shared<Erf_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        myErf->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ExpandImpl.cpp b/unit_tests/operator/Test_ExpandImpl.cpp
index 3fcb5e44..878c6081 100644
--- a/unit_tests/operator/Test_ExpandImpl.cpp
+++ b/unit_tests/operator/Test_ExpandImpl.cpp
@@ -9,21 +9,16 @@
  *
  ********************************************************************************/
 
-#include <aidge/data/Data.hpp>
-#include <aidge/operator/OperatorTensor.hpp>
-#include <aidge/utils/ArrayHelpers.hpp>
-#include <aidge/utils/TensorUtils.hpp>
-#include <aidge/utils/Types.h>
-#include <catch2/catch_test_macros.hpp>
-#include <cstdint>
-#include <cstdlib>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
-#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/Expand.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using std::shared_ptr;
 
@@ -31,8 +26,7 @@ using namespace Aidge;
 
 void setupTestExpand(shared_ptr<Tensor> inputData,
                      shared_ptr<Tensor> inputShape,
-                     shared_ptr<OperatorTensor> &op,
-                     shared_ptr<Tensor> &expectedOutput) {
+                     shared_ptr<Expand_Op> &op) {
 
     op->getOutput(0)->setDataType(inputData->dataType());
 
@@ -41,72 +35,68 @@ void setupTestExpand(shared_ptr<Tensor> inputData,
 
     inputShape->setBackend("cpu");
     op->associateInput(1, inputShape);
-
-    expectedOutput->setBackend("cpu");
-    expectedOutput->setDataType(DataType::Int32);
 }
 
 TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
-    auto node = Expand();
-    auto op = std::static_pointer_cast<OperatorTensor>(node->getOperator());
+    std::shared_ptr<Expand_Op> op = std::make_shared<Expand_Op>();
     op->setBackend("cpu");
 
     SECTION("Expand shape is bigger than inputData") {
         auto inputData = std::make_shared<Tensor>(Array1D<int, 2>({1, 3}));
         auto inputShape =
             std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 3, 4, 2}));
-        auto expectedOutput = std::make_shared<Tensor>(
-            Array4D<int, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
                                         {{1, 3}, {1, 3}, {1, 3}, {1, 3}},
-                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}}));
-        setupTestExpand(inputData, inputShape, op, expectedOutput);
+                                        {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}});
+        setupTestExpand(inputData, inputShape, op);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
         REQUIRE_NOTHROW(op->forward());
-        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+        REQUIRE(expectedOutput == *op->getOutput(0));
     }
     SECTION("Expand shape has less dimensions than inputData") {
         auto inputData = std::make_shared<Tensor>(
             Array3D<int, 2, 1, 3>({{{2, 1, 3}, {2, 1, 3}}}));
         auto inputShape =
             std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
-        auto expectedOutput = std::make_shared<Tensor>(Array3D<int, 2, 2, 3>(
-            {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}}));
-        setupTestExpand(inputData, inputShape, op, expectedOutput);
+        Tensor expectedOutput = Array3D<cpptype_t<DataType::Int32>, 2, 2, 3>(
+            {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}});
+        setupTestExpand(inputData, inputShape, op);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
         REQUIRE_NOTHROW(op->forward());
-        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+        REQUIRE(expectedOutput == *op->getOutput(0));
     }
     SECTION("Expand shape = {1} leads to input equal to output.") {
         auto inputData = std::make_shared<Tensor>(
             Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
         auto inputShape =
             std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1}));
-        auto expectedOutput = std::make_shared<Tensor>(
-            Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
-        setupTestExpand(inputData, inputShape, op, expectedOutput);
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}});
+        setupTestExpand(inputData, inputShape, op);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
         REQUIRE_NOTHROW(op->forward());
-        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+        REQUIRE(expectedOutput == *op->getOutput(0));
     }
     SECTION("The only common dimension is the last one & its equal to 1") {
         auto inputData = std::make_shared<Tensor>(
             Array4D<int, 1, 1, 3, 1>({{{{2, 1, 3}}}}));
         auto inputShape =
             std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1}));
-        auto expectedOutput = std::make_shared<Tensor>(
-            Array4D<int, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}}));
-        setupTestExpand(inputData, inputShape, op, expectedOutput);
+        Tensor expectedOutput =
+            Array4D<cpptype_t<DataType::Int32>, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}});
+        setupTestExpand(inputData, inputShape, op);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
         REQUIRE_NOTHROW(op->forward());
-        CHECK(approxEq<int>(*expectedOutput, *op->getOutput(0)));
+        REQUIRE(expectedOutput == *op->getOutput(0));
     }
     SECTION("N-Dim to N-Dim") {}
     auto inputData = std::shared_ptr<Tensor>();
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index b2566f26..8ac0afc3 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -9,13 +9,16 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/FC.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
 
 using namespace Aidge;
 
@@ -42,11 +45,13 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
               9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
               13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
     std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<int, 2, 5>{
-            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
+    Tensor myOutput = Array2D<int, 2, 5>{
+            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}};
 
     std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
-    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator());
+    op -> setDataType(DataType::Int32);
+    op -> setBackend("cpu");
     op -> associateInput(1, myWeights);
     op -> associateInput(2, myBias);
 
@@ -62,10 +67,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
                   120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
                   135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
         op->associateInput(0, myInput);
-        op -> setDataType(DataType::Int32);
-        op -> setBackend("cpu");
         myFC->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
     SECTION("4D input") {
         std::shared_ptr<Tensor> myInput =
@@ -100,10 +103,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
                                                                      {140, 141, 142, 143, 144},
                                                                      {145, 146, 147, 148, 149}}}}});
         op->associateInput(0, myInput);
-        op -> setDataType(DataType::Int32);
-        op -> setBackend("cpu");
         myFC->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
 
     // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
diff --git a/unit_tests/operator/Test_FoldImpl.cpp b/unit_tests/operator/Test_FoldImpl.cpp
index 6832f5a4..184b9e9a 100644
--- a/unit_tests/operator/Test_FoldImpl.cpp
+++ b/unit_tests/operator/Test_FoldImpl.cpp
@@ -13,6 +13,7 @@
 #include <cstdlib>
 #include <memory>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
@@ -21,8 +22,6 @@
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/operator/Reshape.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Fold(forward)", "[Fold][CPU]") {
diff --git a/unit_tests/operator/Test_LeakyReLUImpl.cpp b/unit_tests/operator/Test_LeakyReLUImpl.cpp
index 85dd9f99..b60b8bb3 100644
--- a/unit_tests/operator/Test_LeakyReLUImpl.cpp
+++ b/unit_tests/operator/Test_LeakyReLUImpl.cpp
@@ -9,13 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/LeakyReLU.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] LeakyReLU(forward)", "[LeakyReLU][CPU]") {
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index af04ede4..de02df2b 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -9,15 +9,17 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <array>
 #include <memory>
-#include <cstdlib>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/MaxPooling.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 
@@ -53,10 +55,9 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         }
     });
     SECTION("Stride") {
-        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+        std::shared_ptr<MaxPooling_Op<2>> op = std::make_shared<MaxPooling_Op<2>>(std::array<std::size_t, 2>({2,2}), std::array<std::size_t, 2>({2,2}));
 
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+        Tensor myOutput = Array4D<float,2,2,2,2> {
             {
                 {
                     {{  0.7995,  0.6142},
@@ -71,12 +72,12 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
                      {0.0857,  0.6776}}
                 }
             }
-        });
-        myMaxPool->getOperator()->associateInput(0,myInput);
-        myMaxPool->getOperator()->setDataType(DataType::Float32);
-        myMaxPool->getOperator()->setBackend("cpu");
-        myMaxPool->forward();
+        };
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
         op->getOutput(0)->print();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        REQUIRE(*(op->getOutput(0)) == myOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_Memorize.cpp b/unit_tests/operator/Test_Memorize.cpp
index 45ab40c5..6c1a617e 100644
--- a/unit_tests/operator/Test_Memorize.cpp
+++ b/unit_tests/operator/Test_Memorize.cpp
@@ -9,21 +9,22 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <memory>
 #include <string>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Add.hpp"
 #include "aidge/operator/Memorize.hpp"
 #include "aidge/operator/Producer.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
 #include "aidge/recipes/GraphViewHelper.hpp"
-
+#include "aidge/scheduler/SequentialScheduler.hpp"
 
 namespace Aidge {
 
@@ -56,10 +57,10 @@ TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") {
         REQUIRE_NOTHROW(scheduler.forward());
         scheduler.saveSchedulingDiagram("simple");
 
-        const auto expectedOutput = std::make_shared<Tensor>(Array1D<int, 1>{{4}});
+        const Tensor expectedOutput = Array1D<int, 1>{{4}};
         std::shared_ptr<Tensor> other = std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0);
         other->print();
-        REQUIRE((*other == *expectedOutput));
+        REQUIRE((*other == expectedOutput));
     }
 }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp
index cdd3a5f9..f7823d02 100644
--- a/unit_tests/operator/Test_PadImpl.cpp
+++ b/unit_tests/operator/Test_PadImpl.cpp
@@ -9,15 +9,17 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstdlib>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PadImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/Pad.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
diff --git a/unit_tests/operator/Test_PaddedConv.cpp b/unit_tests/operator/Test_PaddedConv.cpp
index b7584ad0..4b76fe06 100644
--- a/unit_tests/operator/Test_PaddedConv.cpp
+++ b/unit_tests/operator/Test_PaddedConv.cpp
@@ -9,16 +9,16 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
-#include <cstdlib>
 #include <memory>
 
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
-#include "aidge/operator/MetaOperator.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/operator/MetaOperatorDefs.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
-
-#include "aidge/backend/cpu.hpp"
 
 using namespace Aidge;
 
diff --git a/unit_tests/operator/Test_ReLUImpl.cpp b/unit_tests/operator/Test_ReLUImpl.cpp
index 106d29ec..eebdf7ac 100644
--- a/unit_tests/operator/Test_ReLUImpl.cpp
+++ b/unit_tests/operator/Test_ReLUImpl.cpp
@@ -9,15 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ReLU.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
-
 
 using namespace Aidge;
 
@@ -26,17 +27,16 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
         std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<int,10> {
             {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int,10> {
+        Tensor expectedOutput = Array1D<int,10> {
             {0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        op->forward();
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("2D Tensor") {
@@ -46,20 +46,19 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<int,2,10> {
+        Tensor expectedOutput = Array2D<int,2,10> {
             {
                 { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
                 { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        op->forward();
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 
     SECTION("3D Tensor") {
@@ -75,7 +74,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<int,2,2,10> {
+        Tensor expectedOutput = Array3D<int,2,2,10> {
             {
                 {
                     { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
@@ -86,15 +85,14 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                     { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        op->forward();
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("4D Tensor") {
@@ -122,7 +120,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
+        Tensor expectedOutput = Array4D<int,2,2,2,10> {
             {
                 {
                     {
@@ -145,14 +143,13 @@ TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
                     }
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
         op->associateInput(0,input0);
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
-        myReLU->forward();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        op->forward();
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp
index dd647c7b..30ffeb0d 100644
--- a/unit_tests/operator/Test_ReduceMeanImpl.cpp
+++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp
@@ -9,16 +9,23 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
+#include <algorithm>   // std::fill
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::uint16_t
 #include <memory>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ReduceMean.hpp"
-#include "aidge/operator/Conv.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
diff --git a/unit_tests/operator/Test_SoftmaxImpl.cpp b/unit_tests/operator/Test_SoftmaxImpl.cpp
index da6c6f0d..bc452a40 100644
--- a/unit_tests/operator/Test_SoftmaxImpl.cpp
+++ b/unit_tests/operator/Test_SoftmaxImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Softmax.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -30,28 +32,22 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                     0.35077620, -0.78156322, -0.98952234,  0.04166317,  1.34357309}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
+        Tensor expectedOutput = Array2D<float,2,10> {
             {
                 {0.04883239, 0.11326669, 0.05974559, 0.09930880, 0.09267281, 0.03006749,
                     0.15842478, 0.24514021, 0.07825989, 0.07428131},
                 {0.05429055, 0.27136859, 0.28389078, 0.02240700, 0.06262558, 0.06087753,
                     0.01961952, 0.01593576, 0.04469007, 0.16429459}
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySoftmax = Softmax(1);
-        auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
         op->associateInput(0,input);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        mySoftmax->forward();
-
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        op->forward();
 
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
     SECTION("4D Tensor") {
         std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
@@ -80,7 +76,7 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                 }
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
+        Tensor expectedOutput = Array4D<float,2,3,3,3> {
             {
                 {
                     {{0.45109013, 0.42849392, 0.43775153},
@@ -105,19 +101,14 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
                      {0.34566763, 0.32462072, 0.48979440}}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySoftmax = Softmax(1);
-        auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
         op->associateInput(0,input);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-        mySoftmax->forward();
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_SqrtImpl.cpp b/unit_tests/operator/Test_SqrtImpl.cpp
index d630c66c..aac50460 100644
--- a/unit_tests/operator/Test_SqrtImpl.cpp
+++ b/unit_tests/operator/Test_SqrtImpl.cpp
@@ -9,14 +9,16 @@
  *
  ********************************************************************************/
 
+#include <memory>
+
 #include <catch2/catch_test_macros.hpp>
 
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+#include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Sqrt.hpp"
-
-#include "aidge/backend/cpu.hpp"
-
-#include <memory>
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -28,26 +30,20 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
                 { 0.00000000,  1.84539008}
             }
         });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+        Tensor expectedOutput = Array2D<float,2,2> {
             {
                 {4.00000000, 0.78883994},
                 {0.00000000, 1.35845140}
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySqrt = Sqrt();
-        auto op = std::static_pointer_cast<OperatorTensor>(mySqrt -> getOperator());
-        mySqrt->getOperator()->associateInput(0,input);
-        mySqrt->getOperator()->setDataType(DataType::Float32);
-        mySqrt->getOperator()->setBackend("cpu");
-        mySqrt->forward();
-
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 4; ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        std::shared_ptr<Sqrt_Op> op = std::make_shared<Sqrt_Op>();
+        op->associateInput(0,input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
 
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 
     SECTION("4D Tensor") {
@@ -78,7 +74,7 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
             }
         });
 
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
+        Tensor expectedOutput = Array4D<float,2,3,3,3> {
             {
                 {
                     {{0.24936883, 0.6844717,  0.7804763},
@@ -103,19 +99,14 @@ TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
                      {0.3608653,  0.8571328,  0.16447252}}
                 }
             }
-        });
+        };
 
-        std::shared_ptr<Node> mySqrt = Sqrt();
-        auto op = std::static_pointer_cast<OperatorTensor>(mySqrt -> getOperator());
-        mySqrt->getOperator()->associateInput(0,input);
-        mySqrt->getOperator()->setDataType(DataType::Float32);
-        mySqrt->getOperator()->setBackend("cpu");
-        mySqrt->forward();
+        std::shared_ptr<Sqrt_Op> op = std::make_shared<Sqrt_Op>();
+        op->associateInput(0,input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forward();
 
-        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
-        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 54; ++i) {
-            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
-        }
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
 }
\ No newline at end of file
-- 
GitLab


From ce9095326f4d8f94362835c72992500ef8dd2a27 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 30 Jan 2025 18:05:47 +0000
Subject: [PATCH 28/30] Remove include of cpu.hpp from Test_CastMove.cpp

---
 unit_tests/scheduler/Test_CastMove.cpp | 39 +++++++++++++++-----------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp
index 5ca2cd9d..b78e864f 100644
--- a/unit_tests/scheduler/Test_CastMove.cpp
+++ b/unit_tests/scheduler/Test_CastMove.cpp
@@ -13,15 +13,20 @@
 #include <memory>
 #include <string>
 
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/data/Tensor.hpp"
-#include "aidge/utils/TensorUtils.hpp"
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/operator/FC.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/recipes/Recipes.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -205,15 +210,15 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
         REQUIRE_NOTHROW(scheduler.forward());
         scheduler.saveSchedulingDiagram("schedulingSequential");
 
-        std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
+        Tensor expectedOutput1 = Array4D<int, 2, 3, 3, 3>{
                 {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
                   {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
                   {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
                  {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
                   {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
-                  {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
+                  {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}};
 
-        std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
+        Tensor expectedOutput2 = Array4D<int, 2, 4, 3, 3>{
                 {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
                   {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
                   {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
@@ -221,26 +226,26 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                  {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
                   {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
                   {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
-                  {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
+                  {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}};
 
-        std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
+        Tensor expectedOutput3 = Array4D<int, 2, 3, 3, 3>{
                 {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
                   {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
                   {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
                  {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
                   {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
-                  {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
+                  {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}};
 
         Tensor expectedOutput4 = Array2D<int, 2, 5>{
                 {{205050376, 198925904, 181355097, 196978090, 238868348},
                 {598467376, 561797804, 560823897, 593043790, 698672948}}};
-        std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
-        std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
-        REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other1 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv1")->getOperator())->getOutput(0);
+        REQUIRE(approxEq<float, int>(*other1, expectedOutput1, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other2 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv2")->getOperator())->getOutput(0);
+        REQUIRE(*other2 == expectedOutput2);
+        std::shared_ptr<Tensor> other3 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv3")->getOperator())->getOutput(0);
+        REQUIRE(approxEq<double, int>(*other3, expectedOutput3, 0.0, 1.0e-12));
+        std::shared_ptr<Tensor> other4 = std::static_pointer_cast<FC_Op>(g->getNode("fc")->getOperator())->getOutput(0);
+        REQUIRE(*other4 == expectedOutput4);
     }
 }
-- 
GitLab


From 6b0cb49ea4e22a71cb7af53d3f936527658367b2 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 31 Jan 2025 10:52:26 +0000
Subject: [PATCH 29/30] FEAT: Add Heaviside implementation for CPU backend.

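Adds a forward kernel for Heaviside_Op on the CPU backend, registered for
Float32 input/output; backward() is declared but throws for now. The kernel
returns 1 for positive inputs, 0 for negative inputs, and the operator's
'value' attribute where the input is exactly 0. A minimal usage sketch,
mirroring the new unit test (tensor values and the 0.5 threshold are
illustrative):

    auto heaviside = Heaviside(0.5);  // 'value' returned where input == 0
    auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
    op->associateInput(0, std::make_shared<Tensor>(
        Array1D<float, 4>{{-1.0f, 0.0f, 2.0f, -3.0f}}));
    op->setBackend("cpu");
    op->setDataType(DataType::Float32);
    op->forward();  // output: {0, 0.5, 1, 0}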
---
 include/aidge/backend/cpu.hpp                 |  2 +-
 .../backend/cpu/operator/HeavisideImpl.hpp    | 32 ++++++
 .../cpu/operator/HeavisideImpl_kernels.hpp    | 46 +++++++++
 src/operator/HeavisideImpl.cpp                | 37 +++++++
 unit_tests/operator/Test_HeavisideImpl.cpp    | 98 +++++++++++++++++++
 5 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 include/aidge/backend/cpu/operator/HeavisideImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
 create mode 100644 src/operator/HeavisideImpl.cpp
 create mode 100644 unit_tests/operator/Test_HeavisideImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 0c8ab84d..5db19a2b 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -34,6 +34,7 @@
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/FoldImpl.hpp"
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/backend/cpu/operator/LRNImpl.hpp"
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
@@ -59,4 +60,3 @@
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
 #endif /* AIDGE_CPU_IMPORTS_H_ */
-
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
new file mode 100644
index 00000000..7a3ba9ad
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
+#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
+
+#include <cstddef> // std::size_t
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Heaviside.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/future_std/span.hpp"
+
+namespace Aidge {
+using HeavisideImplCpu =
+    OperatorImpl_cpu<Heaviside_Op,
+                     void(std::size_t, const void *, void *, const float),
+                     void(const float, std::size_t, const void *, void *)>;
+
+// Implementation entry point registration for operator Heaviside
+REGISTRAR(Heaviside_Op, "cpu", HeavisideImplCpu::create);
+} // namespace Aidge
+
+#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
new file mode 100644
index 00000000..3fd6ca7d
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstddef> // std::size_t
+
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+
+namespace Aidge {
+
+template <class I, class O>
+void HeavisideImplCpuForwardKernel(std::size_t inputLength,
+                                   const void *input_,
+                                   void *output_,
+                                   const float value) {
+    const I *input = static_cast<const I *>(input_);
+    O *output = static_cast<O *>(output_);
+
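+    // Heaviside step: 1 for x > 0, 0 for x < 0, and the configurable 'value' at x == 0.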
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(HeavisideImplCpu,
+          {DataType::Float32},
+          {ProdConso::inPlaceModel,
+           Aidge::HeavisideImplCpuForwardKernel<float, float>,
+           nullptr});
+} // namespace Aidge
+
+#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
new file mode 100644
index 00000000..56ceb9b0
--- /dev/null
+++ b/src/operator/HeavisideImpl.cpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+
+#include <stdexcept>
+
+#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/utils/ErrorHandling.hpp"
+
+template <> void Aidge::HeavisideImplCpu::forward() {
+    const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp);
+    std::shared_ptr<Tensor> input0 = op_.getInput(0);
+    std::shared_ptr<Tensor> output0 = op_.getOutput(0);
+    AIDGE_ASSERT(input0, "missing input #0");
+
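+    // Pick the registered kernel that best matches the operator's current input/output specs.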
+    const auto impl =
+        Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.forward(input0->size(),
+                 getCPUPtr(mOp.getRawInput(0)),
+                 getCPUPtr(mOp.getRawOutput(0)),
+                 op_.value());
+}
+
+template <> void Aidge::HeavisideImplCpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Heaviside backward not implemented yet");
+}
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
new file mode 100644
index 00000000..4cbdf1a0
--- /dev/null
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -0,0 +1,98 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
+
+#include <cstdlib>
+#include <functional>  // std::multiplies
+#include <memory>
+#include <numeric>     // std::accumulate
+#include <random>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge
+{
+
+TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+
+    SECTION("1D Tensor") {
+
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<float,10> {
+            {0.5, 1, 1, 0, 1, 0, 0, 1, 1, 1}
+        });
+
+        std::shared_ptr<Node> heaviside = Heaviside(0.5);
+        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
+        op->associateInput(0, input0);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Float32);
+
+        op->forward();
+        REQUIRE(approxEq<float>(*op->getOutput(0),*expectedOutput));
+    }
+
+    SECTION("+1-D Tensor")
+    {
+        auto dims = std::vector<std::size_t>();
+        auto nbDims = nbDimsDist(gen);
+
+        for (auto i = 0u; i < nbDims; ++i) {
+            dims.push_back(dimSizeDist(gen));
+        }
+
+        auto numberOfElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        float* inputArray = new float[numberOfElements];
+        float* resultArray = new float[numberOfElements];
+
+        for(auto i = 0u; i < numberOfElements; ++i)
+        {
+            inputArray[i] = valueDist(gen);
+            resultArray[i] = inputArray[i] > 0 ? 1 : (inputArray[i] == 0 ? 0.5 : 0);
+        }
+
+        auto T0 = std::make_shared<Tensor>();
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+
+        auto T1 = std::make_shared<Tensor>();
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        T0->resize(dims);
+        T0->getImpl()->setRawPtr(inputArray, numberOfElements);
+        T1->resize(dims);
+        T1->getImpl()->setRawPtr(resultArray, numberOfElements);
+
+        std::shared_ptr<Node> heaviside = Heaviside(0.5);
+        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
+        op->associateInput(0, T0);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Float32);
+
+        op->forward();
+
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), *T1));
+    }
+}
+}
-- 
GitLab


From 71080feb778df490bd24a0a562229e8e8573ee7c Mon Sep 17 00:00:00 2001
From: Maxence Naud <maxence.naud@cea.fr>
Date: Fri, 31 Jan 2025 14:11:53 +0000
Subject: [PATCH 30/30] update version 0.4.1 -> 0.5.0

---
 CHANGELOG   | 2 ++
 version.txt | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index a461371a..9153a9a2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,5 @@
+# Version 0.5.0 (January 31, 2025)
+
 # Verson 0.4.0 (December 6, 2024)
 
 # Version 0.2.2 (May 14, 2024)
diff --git a/version.txt b/version.txt
index 267577d4..8f0916f7 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.1
+0.5.0
-- 
GitLab