diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4a443fc403ab341d770455b203524c7f6e65f42a..59b890b815549bce6ee40e5e6651ab1b3e9bd0c9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -29,10 +29,4 @@ test:ubuntu_python:
     - DEPENDENCY_JOB="build:ubuntu_python"
     - !reference [.ubuntu:download:artifacts, script]
     # Need to install extra dependence for tests:
-    - python -m pip install torch torchvision
-coverage:ubuntu_python:
-  before_script:
-    - !reference [.setup:coverage:ubuntu_python, before_script]
-    - DEPS_NAMES=("aidge_onnx" "aidge_quantization")
-    - DEPENDENCY_JOB="build:ubuntu_python"
-    - !reference [.ubuntu:download:artifacts, script]
\ No newline at end of file
+    - python -m pip install torch torchvision
\ No newline at end of file
diff --git a/aidge_export_cpp/benchmark.py b/aidge_export_cpp/benchmark.py
index c6e4b600255b18cc59e817090374650589fbed10..542f6711e635066769fa1cd42125f66772b56e93 100644
--- a/aidge_export_cpp/benchmark.py
+++ b/aidge_export_cpp/benchmark.py
@@ -15,31 +15,42 @@ def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np
     model.set_backend("cpu")
 
     # create input Tensor list for the GraphView
-    ordered_inputs: list[aidge_core.Tensor] = []
+    ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
     # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
-    for i in input_data:
-        nb_dims = len(i[1].shape)
-        if nb_dims == 3:
-            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
-        if nb_dims == 4:
-            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
-        else:
-            ordered_inputs.append(aidge_core.Tensor(i[1]))
+    # for i in input_data:
+    #     nb_dims = len(i[1].shape)
+    #     if nb_dims == 3:
+    #         ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
+    #     if nb_dims == 4:
+    #         ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
+    #     else:
+    #         ordered_inputs.append(aidge_core.Tensor(i[1]))
 
     # set inputs for the export
     for i, inp in enumerate(model.get_ordered_inputs()):
         op = inp[0].get_operator()
-        op.set_input(i, ordered_inputs[i])
+        ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
+        op.associate_input(i, ordered_inputs[i])
 
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    for i in range(len(ordered_inputs)):
+        ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
+
+    model.set_dataformat(aidge_core.dformat.nhwc)
+    model.set_backend(aidge_export_cpp.ExportLibCpp._name)
+    aidge_core.adapt_to_backend(model)
+    aidge_core.adapt_fc_params_format(model)
     model.forward_dims([t.dims() for t in ordered_inputs])
 
-    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.reset_scheduling()
     scheduler.generate_scheduling()
 
     # for ordered_input in ordered_inputs:
         # ordered_input.set_backend("cpu")
     operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
-    print("  ├─Generating export...", end="", flush=True)
+
     folder_name: str = f"{operator_type.lower()}_test_export_cpp"
     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
         aidge_core.export_utils.scheduler_export(
@@ -50,12 +61,10 @@ def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np
             memory_manager_args={"wrapping": False }
         )
         aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
-    print(" ok")
 
-    print("  ├─Compiling...", end="", flush=True)
     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
         run(['make'], cwd=folder_name, stdout=f)
-    print(" ok")
+
     timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
 
     folder_path = os.path.abspath(folder_name)
@@ -70,30 +79,31 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray
     model.set_backend("cpu")
 
     # create input Tensor list for the GraphView
-    ordered_inputs: list[aidge_core.Tensor] = []
-    # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor
-    for i in input_data:
-        nb_dims = len(i[1].shape)
-        if nb_dims == 3:
-            ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
-        if nb_dims == 4:
-            ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
-        else:
-            ordered_inputs.append(aidge_core.Tensor(i[1]))
+    ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
 
     # set inputs for the export
     for i, inp in enumerate(model.get_ordered_inputs()):
         op = inp[0].get_operator()
-        op.set_input(i, ordered_inputs[i])
-
-    model.forward_dims([t.dims() for t in ordered_inputs])
+        ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
+        op.associate_input(i, ordered_inputs[i])
 
     scheduler = aidge_core.SequentialScheduler(model)
     scheduler.generate_scheduling()
 
+    for i in range(len(ordered_inputs)):
+        ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
+
+    model.set_dataformat(aidge_core.dformat.nhwc)
+    model.set_backend(aidge_export_cpp.ExportLibCpp._name)
+    aidge_core.adapt_to_backend(model)
+    aidge_core.adapt_fc_params_format(model)
+    model.forward_dims([t.dims() for t in ordered_inputs])
+
+    scheduler.reset_scheduling()
+    scheduler.generate_scheduling()
 
     operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
-    print("  │ Generating export...", end="", flush=True)
+
     folder_name: str = f"{operator_type.lower()}_test_export_cpp"
     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
         aidge_core.export_utils.scheduler_export(
@@ -104,12 +114,10 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray
             memory_manager_args={"wrapping": False }
         )
         aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
-    print(" ok")
 
-    print("  │ Compiling...", end="", flush=True)
     with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
         run(['make'], cwd=folder_name, stdout=f)
-    print(" ok")
+
     output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
     folder_path = os.path.abspath(folder_name)
     if os.path.exists(folder_path):
@@ -117,22 +125,12 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray
 
     outputs_str: list[str] = output_str.stdout.strip().split('\n')
     outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
-
-    for i, pair in enumerate(model.get_ordered_outputs()):
-        dims = pair[0].get_operator().get_output(pair[1]).dims()
-        nb_dims = len(dims)
-        dims_permutted = dims
-        if nb_dims == 3:
-            dims_permutted = [dims[0], dims[2], dims[1]]
-        if nb_dims == 4:
-            dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
-
-        if np.prod(dims) != outputs[i].size:
-            aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
-        outputs[i] = outputs[i].reshape(dims_permutted)
-        if nb_dims == 3:
-            outputs[i] = outputs[i].transpose(0,2,1)
-        if nb_dims == 4:
-            outputs[i] = outputs[i].transpose(0,3,1,2)
-
-    return outputs
+    output_tensors = []
+    outputs_dims = [pair[0].get_operator().get_output(pair[1]).dims() for pair in model.get_ordered_outputs()]
+    for out_idx, arr in enumerate(outputs):
+        t = aidge_core.Tensor(arr.reshape(outputs_dims[out_idx]))
+        t.set_data_format(aidge_core.dformat.nhwc)
+        t.set_data_format(aidge_core.dformat.nchw)
+        output_tensors.append(np.array(t))
+
+    return output_tensors
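
A note on the layout handling above: the removed np.transpose workaround is replaced by aidge_core's data-format mechanism, where changing a tensor's data format transposes the underlying data (the same trick maps the export outputs back from NHWC to NCHW). A minimal sketch of the intended equivalence, assuming Tensor reorders its data in place when the format changes (array contents are illustrative):

    import numpy as np
    import aidge_core

    # Illustrative 4D activation, created in NCHW layout.
    arr = np.arange(24, dtype=np.float32).reshape(1, 2, 3, 4)

    t = aidge_core.Tensor(arr)
    t.set_data_format(aidge_core.dformat.nchw)  # declare the current layout
    t.set_data_format(aidge_core.dformat.nhwc)  # convert: transposes the data

    # The manual equivalent this patch removes from benchmark.py:
    manual = np.transpose(arr, axes=(0, 2, 3, 1))
    print(np.array(t).flatten()[:6], manual.flatten()[:6])
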
diff --git a/aidge_export_cpp/export.py b/aidge_export_cpp/export.py
index 10741b0d7bd5116b4d5d833157d2bad504d9aca0..4cd93b85466d8008c8d44d0b57ee428a5f1e0c5a 100644
--- a/aidge_export_cpp/export.py
+++ b/aidge_export_cpp/export.py
@@ -1,33 +1,34 @@
 import os
 import shutil
-import numpy as np
 from pathlib import Path
 from typing import List, Union
 
 import aidge_core
 from aidge_core.mem_info import generate_optimized_memory_info
-from aidge_core.export_utils import scheduler_export, generate_main_cpp, aidge2c, generate_file
+from aidge_core.export_utils import scheduler_export, generate_main_cpp
 
-from aidge_export_cpp import ExportLibCpp, ROOT
-from aidge_export_cpp.export_utils import read_log_file
+from aidge_export_cpp import ExportLibCpp
+from aidge_export_cpp.export_utils import *
 
 
 def export(export_folder_name: str,
-           graphview: aidge_core.GraphView,
+           model: aidge_core.GraphView,
            scheduler: Union[List[aidge_core.Node],
                             aidge_core.Scheduler],
            inputs_tensor: aidge_core.Tensor = None,
            labels: aidge_core.Tensor = None,
            dev_mode: bool = False,
-           aidge_cmp: bool = False):
+           aidge_cmp: bool = False,
+           memory_manager = generate_optimized_memory_info,
+           memory_manager_args = {}):
     
     """ Export an aidge_core.Scheduler to C++ code
     
     :param export_folder_name: Export folder name
     :type export_folder_name: str
-    :param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
+    :param model: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
                        ordered input/output data within the computational graph.
-    :type graph_view: aidge_core.GraphView
+    :type model: aidge_core.GraphView
     :param scheduler: Scheduler instance managing the computation graph.
                       Uses `graph_view` and `get_sequential_static_scheduling` methods
     :param inputs_tensor: **For future** argument to provide tensor to use in the main function, not implemented yet!
@@ -43,52 +44,112 @@ def export(export_folder_name: str,
     :type dev_mode: boolean
     """
 
-    export_folder_name = Path(export_folder_name)
+    # Graph Log (Save the state of the graph through export steps)
+    os.makedirs("graph_log", exist_ok=True)
+    model.save(f"graph_log/0_export_start")
+
+    # Remove scaling producers from the export
+    exclude_unwanted_producers(model)
+
+    # Fuse nodes into MetaOps adapted to the CPP Export
+    cpp_fuse_to_metaops(model)
+    model.save(f"graph_log/1_fused_model")
+
+    # Reset the scheduler after graph modification
+    scheduler = aidge_core.SequentialScheduler(model) if scheduler is None else scheduler
+    scheduler.reset_scheduling()
+    scheduler.generate_scheduling()
+
+    # Normalize node names
+    set_nodes_names(scheduler)
+    model.save(f"graph_log/2_named_model")
+
+    # Run a last inference to set the inputs and generate the ifmaps (aidge_cmp)
+    if inputs_tensor is not None:
+        output_array = propagate(model, scheduler, inputs_tensor)
+        aidge_core.Log.notice(f"Exported sample results : {np.argmax(output_array)} ( {str(np.max(output_array))} )")
+        aidge_core.Log.notice(f"Label : {labels}")
+    elif aidge_cmp:
+        aidge_cmp = False
+        aidge_core.Log.error("aidge_cmp : No input_tensor has been provided to the export() function.\n\
+                             Therefore ifmaps have not been generated and aidge_cmp cannot be used.")
+
+    # Set node datatypes if the model has been quantized
+    # TODO: should be replaced by the future quantization feature
+    if inputs_tensor is not None:
+        if inputs_tensor.dtype() == aidge_core.dtype.int32:
+            set_nodes_datatypes(model)      # Set datatype to int8 only
+            inputs_tensor.set_datatype(aidge_core.dtype.int8)
+            model.save(f"graph_log/3_set_datatypes")
+
+    # [aidge_cmp] Export feature maps tensors as json
+    if aidge_cmp:
+        generate_aidge_ifmaps(model)
+
+    # [aidge_cmp] Set flags on each node
+    if aidge_cmp:
+        for node in model.get_nodes():
+            node.attributes().aidge_cmp = True
+
+    # Set model's dataformat (NHWC)
+    ## Inputs
+    for in_node in model.get_ordered_inputs():
+        input = in_node[0].get_operator().get_input(0)
+        if input is not None:
+            # Transpose the input by switching its data format (NCHW -> NHWC)
+            input_cpy = input.clone()
+            input_cpy.set_data_format(aidge_core.dformat.nchw)
+            input_cpy.set_data_format(aidge_core.dformat.nhwc)
+            in_node[0].get_operator().set_input(0, input_cpy)
+    ## Rest of the graph
+    model.set_dataformat(aidge_core.dformat.nhwc)
+    model.save(f"graph_log/4_set_dataformats")
+
+    # Set model's backend
+    model.set_backend(ExportLibCpp._name)
+
+    # Adapt the graph to the selected backend
+    aidge_core.adapt_to_backend(model)
+    model.save(f"graph_log/5_adapt_to_backend")
+    aidge_core.adapt_fc_params_format(model)
+    model.save(f"graph_log/6_adapt_fc_params_format")
+
+    # At this point, the graph dimensions should be statically forwardable,
+    # so allow_data_dependency can safely be set to True
+    dims = []
+    for in_node in model.get_ordered_inputs():
+        dims.append(in_node[0].get_operator().get_input(0).dims())
+    model.forward_dims(dims=dims, allow_data_dependency=True)
+
+    # Reset the scheduling as the graph may have been changed
+    scheduler.reset_scheduling()
+    scheduler.generate_scheduling()
 
     # Remove existing export
+    export_folder_name = Path(export_folder_name)
     if os.path.isdir(export_folder_name):
         print("Removing existing export directory...")
         shutil.rmtree(export_folder_name)
 
-    # Generate Model Files
-    """
-    Perform the following tasks :
-    - Generate the parameters and layers config files
-    - Generate the forward.cpp file
-    - Copy all needed kernels
-    """
+    # Save the model
+    model.save("graph_log/7_exported_model")
+
+    # Set up the stats folder (copy the dict to avoid mutating the default argument)
+    memory_manager_args = dict(memory_manager_args)
+    memory_manager_args.setdefault("stats_folder", f"{export_folder_name}/stats")
 
+    # Generate the export
     scheduler_export(scheduler,
                      export_folder_name,
                      ExportLibCpp,
-                     memory_manager=generate_optimized_memory_info,
-                     memory_manager_args={
-                         "stats_folder": f"{export_folder_name}/stats"},
+                     memory_manager=memory_manager,
+                     memory_manager_args=memory_manager_args,
                      dev_mode=dev_mode)
+    model.save(f"{export_folder_name}/graph")
     
     # Generate main file
-    generate_main_cpp(export_folder_name, graphview, labels=labels, inputs_tensor=inputs_tensor)
+    generate_main_cpp(export_folder_name, model, labels=labels, inputs_tensor=inputs_tensor)
 
     # Generate log files (aidge_cmp option)
-    """
-    If the aidge_cmp option has been enabled, the generated log_outputs will
-    be copied into the generated export in order to be used as reference. 
-    """
     if aidge_cmp:
-        ranked_nodes = graphview.get_ranked_nodes_name("{0}[{1}#{3}]")
-        os.makedirs(export_folder_name / "data" / "aidge_outputs")
-        os.makedirs(export_folder_name / "data" / "export_outputs")
-        for node in graphview.get_nodes():
-            if node.type() != "Producer":
-                file_path = 'log_outputs/' + ranked_nodes[node] + '/output_0.log'
-                data_t = aidge2c(node.get_operator().get_output(0).dtype())
-                name = node.name() + '_output_0_aidge'
-                dims = node.get_operator().get_output(0).dims()
-                values = read_log_file(file_path)
-
-                generate_file(export_folder_name / "data" / "aidge_outputs" / (node.name() + ".hpp"),
-                              ROOT / "templates" / "data" / "aidge_tensor.jinja",
-                              data_t=data_t,
-                              name=name,
-                              dims=dims,
-                              values=values)
+        export_aidge_ifmaps(export_folder_name)
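
For reference, a minimal usage sketch of the reworked export() entry point; the model file name and the wrapping flag are illustrative assumptions, not part of this patch:

    import aidge_core
    import aidge_onnx
    from aidge_export_cpp.export import export

    model = aidge_onnx.load_onnx("model.onnx")  # hypothetical model file

    scheduler = aidge_core.SequentialScheduler(model)

    # memory_manager defaults to generate_optimized_memory_info, and a
    # "stats_folder" entry is added to memory_manager_args when missing.
    export("my_export_cpp",
           model,
           scheduler,
           memory_manager_args={"wrapping": False})
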
diff --git a/aidge_export_cpp/export_utils.py b/aidge_export_cpp/export_utils.py
index e22524fb9058dfb4c8b023d0df8fbe11e2ff791b..4782119649439b50befd91cb5a9b2c2d10a86610 100644
--- a/aidge_export_cpp/export_utils.py
+++ b/aidge_export_cpp/export_utils.py
@@ -1,13 +1,17 @@
 import os
+import json
+import numpy as np
 from collections import OrderedDict
 
 import aidge_core
-from aidge_core.export_utils import get_node_from_metaop
+from aidge_core.export_utils import get_node_from_metaop, aidge2c, generate_file
+
+from aidge_export_cpp import ROOT
 
 def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
     """ 
     Fuse nodes into metaops adapted for the CPP Export
-    TODO: These recipes should be into aidge_core
+    TODO: These recipes should be in aidge_core
 
     :param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
                        ordered input/output data within the computational graph.
@@ -137,8 +141,9 @@ def set_nodes_datatypes(graph_view: aidge_core.GraphView):
             if get_node_from_metaop(node, "FC") or \
                get_node_from_metaop(node, "Conv2D") or \
                get_node_from_metaop(node, "ConvDepthWise2D"):
-
-                node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)    # Input
+
+                if node.get_operator().get_input(0) is not None:
+                    node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)    # Input
                 node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8)    # Weights
                 if node.get_parent(2) is not None:
                     node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32)   # Biases
@@ -148,26 +153,8 @@ def set_nodes_datatypes(graph_view: aidge_core.GraphView):
 
     # Set input node's datatype
     for n in graph_view.get_input_nodes():
-        n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
-
-
-
-def read_log_file(file_path: str):
-    """ Read log file
-    Used to read the aidge generated log files containing the intermediate
-    tensors of the exported model. 
-
-    :param file_path: Path to the file to read. 
-    :type file_path: str
-    """
-    # Check if the file exists
-    if not os.path.isfile(file_path):
-        print(f"File not found: {file_path}")
-        return None
-
-    with open(file_path, 'r') as file:
-        content = file.read()
-    return content
+        if n.get_operator().get_input(0) is not None:
+            n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
 
 
 
@@ -226,3 +213,68 @@ def normalize(array):
     array = (array - array.min()) / (array.max() - array.min())
     return 2 * array - 1
 
+
+
+def generate_aidge_ifmaps(model):
+
+    json_nodes = []
+    for node in model.get_nodes():
+        if node.type() != "Producer":
+
+            output = node.get_operator().get_output(0)
+            data = {
+                "name": node.name(),
+                "dims": output.dims(),
+                "dtype": aidge2c(output.dtype()),
+                "dformat": str(output.dformat()),
+                "values": np.array(output).tolist()
+            }
+            json_nodes.append(data)
+
+    # Write the entire list to the JSON file after the loop
+    with open('aidge_output.json', 'w') as file:
+        json.dump(json_nodes, file, indent=2, separators=(",", ": "))
+
+
+
+def export_aidge_ifmaps(export_folder_name):
+    os.makedirs(export_folder_name / "data" / "aidge_outputs")
+    os.makedirs(export_folder_name / "data" / "export_outputs")
+
+    # Load the JSON data from the file
+    with open('aidge_output.json', 'r') as file:
+        json_nodes = json.load(file)
+
+    # Access the data
+    for node in json_nodes:
+        name = node["name"]
+        dims = node["dims"]
+        dtype = node["dtype"]
+        dformat = node["dformat"]
+        values = node["values"]
+
+        generate_file(export_folder_name / "data" / "aidge_outputs" / (name + ".hpp"),
+                      ROOT / "templates" / "data" / "aidge_tensor.jinja",
+                      dtype=dtype,
+                      dformat=dformat,
+                      name=name + "_output_0_aidge",
+                      dims=dims,
+                      values=values)
+
+    # Remove the JSON file
+    os.remove('aidge_output.json')
+
+
+
+def propagate(model, scheduler, tensor):
+    """
+    Propagate the given tensor through the model and return the
+    output as a numpy array.
+    """
+    # Run the inference
+    scheduler.forward(True, [tensor])
+    # Gather the results
+    output_node = model.get_ordered_outputs()[0][0]
+    output_tensor = output_node.get_operator().get_output(0).clone()
+    output_tensor.set_backend("cpu")
+    return np.array(output_tensor)
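
The two new helpers above hand the reference ifmaps over through a temporary aidge_output.json: generate_aidge_ifmaps serializes every non-Producer output before the graph is converted to NHWC, and export_aidge_ifmaps replays the file into per-node .hpp references once the export folder exists. A toy sketch of that round trip with plain json/numpy (entry contents are illustrative):

    import json
    import numpy as np

    # Stand-in for one node entry, mirroring generate_aidge_ifmaps above.
    entry = {
        "name": "conv1",            # illustrative node name
        "dims": [1, 4, 2, 2],
        "dtype": "float",           # what aidge2c() would return
        "dformat": "dformat.nchw",
        "values": np.zeros((1, 4, 2, 2)).tolist(),
    }

    with open("aidge_output.json", "w") as f:
        json.dump([entry], f, indent=2, separators=(",", ": "))

    # export_aidge_ifmaps later reloads the file and renders each entry
    # through templates/data/aidge_tensor.jinja as <name>.hpp.
    with open("aidge_output.json") as f:
        for node in json.load(f):
            print(node["name"], node["dims"], node["dtype"])
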
diff --git a/aidge_export_cpp/kernels/activation.hpp b/aidge_export_cpp/kernels/activation.hpp
index ee80ed275ab9edf574dee6e7d32276f00ba92412..616b09581f55b2312780e4391592dba1214566a7 100644
--- a/aidge_export_cpp/kernels/activation.hpp
+++ b/aidge_export_cpp/kernels/activation.hpp
@@ -3,9 +3,22 @@
 
 #include "network/activation_utils.hpp"
 #include "network/rescaling_utils.hpp"
+#include <sys/types.h>
 
-template<int NB_DATA,
+template<size_t NB_ELTS,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T, typename Rescaling_T>
 __attribute__((always_inline)) inline
 void activation_forward (
@@ -13,12 +26,22 @@ void activation_forward (
     Output_T* __restrict outputs,
     const Rescaling_T& __restrict rescaling)
 {
-    for (int i = 0; i < NB_DATA; ++i)
-    {
-        outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling);
-    }
+    int inOffset = 0;
+    int outOffset = 0;
 
-}
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+            inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+        }
+
+        if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+            outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                        - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+        }
 
+        outputs[outOffset + i] = activation_forward_value<Output_T>(inputs[inOffset + i], i, ACTIVATION, rescaling);
+    }
+}
 
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
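
The new MEM_* template parameters describe the ring buffer laid out by the memory planner: a contiguous region (CONT_OFFSET/CONT_SIZE, in bytes) optionally continued by a wrapped region (WRAP_OFFSET/WRAP_SIZE). Once the element index crosses the contiguous part, a constant offset redirects accesses into the wrapped part. A self-contained sketch of that index arithmetic (region sizes are illustrative):

    #include <cstddef>
    #include <cstdio>

    // Illustrative mapping: 6 floats contiguous at byte offset 0, then
    // 4 more floats wrapped to byte offset 64 of the same arena.
    constexpr size_t MEM_CONT_OFFSET = 0;
    constexpr size_t MEM_CONT_SIZE   = 6 * sizeof(float);
    constexpr size_t MEM_WRAP_OFFSET = 64;
    constexpr size_t MEM_WRAP_SIZE   = 4 * sizeof(float);

    int main() {
        int offset = 0;  // element offset added to the logical index
        for (size_t i = 0; i < 10; ++i) {
            if (MEM_WRAP_SIZE > 0 && i == MEM_CONT_SIZE / sizeof(float)) {
                // Jump from the end of the contiguous region to the wrapped one.
                offset = (MEM_WRAP_OFFSET - MEM_CONT_OFFSET - MEM_CONT_SIZE) / sizeof(float);
            }
            std::printf("logical %zu -> physical element %zu\n", i, i + offset);
        }
        return 0;
    }
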
diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp
index 27866ab923eb8a519e684030cfb63f894c15ec98..b2a3452c9c12d0e64cc656718d6653409647105c 100644
--- a/aidge_export_cpp/kernels/batchnorm.hpp
+++ b/aidge_export_cpp/kernels/batchnorm.hpp
@@ -3,13 +3,13 @@
 
 #include "network/typedefs.hpp"
 #include "network/activation_utils.hpp"
-
+#include <sys/types.h>
 #include <math.h>
 
 // WARNING: this kernel only works for 32-bits floating point values
 
-template<int NB_BATCHES, int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+template<size_t NB_BATCHES, size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
          ActivationFunction_T ACTIVATION,
          typename Input_T, typename Output_T,
          typename Param_T,
@@ -25,14 +25,14 @@ void batchnorm_forward (
     const double epsilon,
     const Rescaling_T& __restrict rescaling)
 {
-    for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
-        for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
+    for (size_t batch = 0; batch < NB_BATCHES; ++batch) {
+        for (size_t output = 0; output < NB_OUTPUTS; ++output) {
             // If the variance is 0, we need to avoid division by 0
             Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
 
-            for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-                for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-                    const int outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
+            for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+                for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+                    const size_t outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
 
                     const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
                     const Output_T sAs = scales[output] * normalized + biases[output];
diff --git a/aidge_export_cpp/kernels/concat.hpp b/aidge_export_cpp/kernels/concat.hpp
index dde8c4fc3a9ce9eea5d4ae4cfad35c078f60450d..effaeba35bbd2997df0116e7a04dc53e5f7f3798 100644
--- a/aidge_export_cpp/kernels/concat.hpp
+++ b/aidge_export_cpp/kernels/concat.hpp
@@ -1,39 +1,37 @@
 #ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
 #define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
 
-template<int AXIS_SIZE_POST,
-         int AXIS_SIZE_PRE,
-         unsigned int NB_INPUTS,
+#include <sys/types.h>
+
+template<size_t AXIS_SIZE_POST,
+         size_t AXIS_SIZE_PRE,
+         const size_t AXIS_SIZE[],
+         size_t TOTAL_AXIS_SIZE,
+         size_t NB_INPUTS,
          typename T>
 __attribute__((always_inline)) inline static
 void concat_forward (
     const T* const * __restrict inputs,
-    const unsigned int* __restrict sizes,
     T* __restrict output)
 {
-    unsigned int total_concat_axis_size = 0;
-    for (unsigned int n = 0; n < NB_INPUTS; ++n)
-        total_concat_axis_size += sizes[n];
-
-    for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
+    for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
         // Loop over post-axis (e.g., dims after axis 1)
-        for (int j = 0; j < AXIS_SIZE_POST; ++j) {
-            unsigned int axis_offset = 0;
+        for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
+            size_t axis_offset = 0;
 
             // Loop over each input tensor
-            for (unsigned int n = 0; n < NB_INPUTS; ++n) {
-                for (unsigned int k = 0; k < sizes[n]; ++k) {
-                    const int input_idx  = i * sizes[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
+            for (size_t n = 0; n < NB_INPUTS; ++n) {
+                for (size_t k = 0; k < AXIS_SIZE[n]; ++k) {
+                    const size_t input_idx  = i * AXIS_SIZE[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
 
-                    output[i * total_concat_axis_size * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
+                    output[i * TOTAL_AXIS_SIZE * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
                         inputs[n][input_idx];
                 }
 
-                axis_offset += sizes[n];  // move along axis in output
+                axis_offset += AXIS_SIZE[n];  // move along axis in output
             }
         }
     }
-
 }
 
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
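
The runtime sizes argument becomes the compile-time AXIS_SIZE array (with its precomputed TOTAL_AXIS_SIZE), letting the compiler constant-fold the per-input loops. An array used as a non-type template argument must be a constant with static storage duration. A self-contained sketch of how such a kernel is instantiated, re-creating the loop above with illustrative sizes:

    #include <cstddef>
    #include <cstdio>

    // An array used as a non-type template argument needs static storage
    // duration, hence constexpr at namespace scope.
    constexpr size_t AXIS_SIZE[] = {2, 3};

    template<size_t AXIS_SIZE_POST, size_t AXIS_SIZE_PRE,
             const size_t SIZES[], size_t TOTAL, size_t NB_INPUTS, typename T>
    void concat_forward(const T* const* inputs, T* output) {
        for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
            for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
                size_t axis_offset = 0;
                for (size_t n = 0; n < NB_INPUTS; ++n) {
                    for (size_t k = 0; k < SIZES[n]; ++k) {
                        output[i * TOTAL * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j]
                            = inputs[n][i * SIZES[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j];
                    }
                    axis_offset += SIZES[n];
                }
            }
        }
    }

    int main() {
        // Concatenate two 1D tensors of sizes 2 and 3 along the axis -> size 5.
        const float a[] = {0, 1};
        const float b[] = {2, 3, 4};
        const float* inputs[] = {a, b};
        float out[5];
        concat_forward<1, 1, AXIS_SIZE, 5, 2, float>(inputs, out);
        for (float v : out) std::printf("%g ", v);  // prints: 0 1 2 3 4
        return 0;
    }
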
diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index ed62401e69ff8d53b23ba9f88917bb54acd3740a..01b3f1b8e2ac3bc731f7067f6b6c3e8ee8706089 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -6,17 +6,29 @@
 #include "network/utils.hpp"
 #include "network/macs.hpp"
 #include "network/activation_utils.hpp"
-
-
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+#include <sys/types.h>
+
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         size_t PADDING_Y, size_t PADDING_X,
+         size_t STRIDE_Y, size_t STRIDE_X,
+         size_t DILATION_Y, size_t DILATION_X,
+         size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T,
          typename Weight_T, typename Bias_T,
          typename Rescaling_T>
@@ -28,62 +40,82 @@ void convolution_forward(
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr int DILATED_KERNEL_HEIGHT
-            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
-
-    constexpr int DILATED_KERNEL_WIDTH
-            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
-
-    constexpr int OUTPUTS_HEIGHT_NOPAD
+    constexpr size_t OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
+    constexpr size_t OUTPUTS_WIDTH_NOPAD
         = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
 
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
-            : max(PADDING_Y - (oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
-                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
-                    0, DILATED_KERNEL_HEIGHT);
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
+    for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const size_t syMin = (PADDING_Y == 0) ? 0
+             : max((PADDING_Y - (oy * STRIDE_Y) + DILATION_Y - 1) / DILATION_Y, 0);
+        const size_t syMax = (PADDING_Y == 0
+                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
+            : clamp((CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y)) / DILATION_Y,
+                    0, KERNEL_HEIGHT);
+        const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
+        for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            for (size_t output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
+                const size_t sxMin = (PADDING_X == 0) ? 0
+                    : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0);
+                const size_t sxMax = (PADDING_X == 0
                         && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? DILATED_KERNEL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
-                            0, DILATED_KERNEL_WIDTH);
-                const int ix = (ox * STRIDE_X) - PADDING_X;
+                            ? KERNEL_WIDTH
+                    : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X,
+                            0, KERNEL_WIDTH);
+                const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
 
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                const int oOffset = NB_OUTPUTS * oPos;
+                const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
+                int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
+
+                if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+                    oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                                - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+                }
 
                 // <--
                 // Check if the biases are defined
                 Bias_T weightedSum = biases ? biases[output] : 0;
 
-                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                     if ((PADDING_Y != 0
                             || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
+                        && sy >= syMax - syMin)
                     {
-                        continue;
+                        break;
                     }
 
-                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    const int iOffset = NB_CHANNELS * iPos;
+                    const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix)
+                                        + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y));
+                    int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-                    const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
+                    // Wrapping cannot occur in the middle of a line, except if
+                    // there is only one line (1D)!
+                    bool wrapInRange = false;
 
-                    if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                    if (INPUT_MEM_WRAP_SIZE > 0
+                        && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                    }
+                    else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
+                        && CHANNELS_HEIGHT == 1 // single line (1D)!
+                        && iOffset + KERNEL_WIDTH * NB_CHANNELS
+                            > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        wrapInRange = true;
+                    }
+
+                    const size_t wOffset = NB_CHANNELS * (sxMin
+                        + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
+
+                    if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
+                        && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                         || sxMax - sxMin == KERNEL_WIDTH))
                     {
                         macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
@@ -92,16 +124,24 @@ void convolution_forward(
                             weightedSum);
                     }
                     else {
-                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                        for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
                             if ((PADDING_X != 0
                                     || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
+                                && sx >= sxMax - sxMin)
                             {
-                                continue;
+                                break;
                             }
 
-                            const int iOffsetInRange = iOffset
-                                + sx * DILATION_X * NB_CHANNELS;
+                            int iOffsetInRange = iOffset
+                                + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                            if (wrapInRange
+                                && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                            {
+                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                            - INPUT_MEM_CONT_OFFSET
+                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                            }
 
                             macsOnRange<NB_CHANNELS>(
                                 // same input line so no wrapping can occur
@@ -118,16 +158,28 @@ void convolution_forward(
     }
 }
 
-// Template specialization when biases are not given to the convolution
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+// Overload for when biases are not given to the convolution
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         size_t PADDING_Y, size_t PADDING_X,
+         size_t STRIDE_Y, size_t STRIDE_X,
+         size_t DILATION_Y, size_t DILATION_X,
+         size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T,
          typename Weight_T,
          typename Rescaling_T>
@@ -155,7 +207,19 @@ void convolution_forward(
                         DILATION_X,
                         KERNEL_HEIGHT,
                         KERNEL_WIDTH,
-                        ACTIVATION>
+                        ACTIVATION,
+                        // Memory mapping: inputs
+                        INPUT_MEM_CONT_OFFSET,
+                        INPUT_MEM_CONT_SIZE,
+                        INPUT_MEM_WRAP_OFFSET,
+                        INPUT_MEM_WRAP_SIZE,
+                        INPUT_MEM_STRIDE,
+                        // Memory mapping: outputs
+                        OUTPUT_MEM_CONT_OFFSET,
+                        OUTPUT_MEM_CONT_SIZE,
+                        OUTPUT_MEM_WRAP_OFFSET,
+                        OUTPUT_MEM_WRAP_SIZE,
+                        OUTPUT_MEM_STRIDE>
                         (inputs, outputs, weights, b, rescaling);
 }
 
diff --git a/aidge_export_cpp/kernels/convolution_depthwise.hpp b/aidge_export_cpp/kernels/convolution_depthwise.hpp
index 244dd86bc01be7142474380f1e3393ce32446aaf..613271cae946a82cd65d0d8dbc5db66fe42745d4 100644
--- a/aidge_export_cpp/kernels/convolution_depthwise.hpp
+++ b/aidge_export_cpp/kernels/convolution_depthwise.hpp
@@ -6,16 +6,29 @@
 #include "network/utils.hpp"
 #include "network/macs.hpp"
 #include "network/activation_utils.hpp"
-
-template<int NB_CHANNELS, 
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+#include <sys/types.h>
+
+template<size_t NB_CHANNELS, 
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         size_t PADDING_Y, size_t PADDING_X,
+         size_t STRIDE_Y, size_t STRIDE_X,
+         size_t DILATION_Y, size_t DILATION_X,
+         size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T,
          typename Weight_T, typename Bias_T,
          typename Rescaling_T>
@@ -30,50 +43,55 @@ void convolution_depthwise_forward(
     static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
         "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
 
-    constexpr int DILATED_KERNEL_HEIGHT 
+    constexpr size_t DILATED_KERNEL_HEIGHT 
             = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
 
-    constexpr int DILATED_KERNEL_WIDTH 
+    constexpr size_t DILATED_KERNEL_WIDTH 
             = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
 
-    constexpr int OUTPUTS_HEIGHT_NOPAD
+    constexpr size_t OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
+    constexpr size_t OUTPUTS_WIDTH_NOPAD
         = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
 
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
+    for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const size_t syMin = (PADDING_Y == 0) ? 0
             : max(PADDING_Y - (oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
+        const size_t syMax = (PADDING_Y == 0
                 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
             : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
                     0, DILATED_KERNEL_HEIGHT);
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
+        const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
+        for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            for (size_t output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
+                const size_t sxMin = (PADDING_X == 0) ? 0
                     : max(PADDING_X - (ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
+                const size_t sxMax = (PADDING_X == 0
                         && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                             ? DILATED_KERNEL_WIDTH
                     : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
                             0, DILATED_KERNEL_WIDTH);
-                const int ix = (ox * STRIDE_X) - PADDING_X;
+                const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
 
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                const int oOffset = NB_OUTPUTS * oPos;
+                const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
+                int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
+
+                if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+                    oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                                - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+                }
                 // <--
 
-                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
+                const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS;
 
                 Bias_T weightedSum = biases ? biases[output] : 0;
 
-                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                     if ((PADDING_Y != 0
                             || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                         && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
@@ -81,13 +99,33 @@ void convolution_depthwise_forward(
                         continue;
                     }
 
-                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    const int iOffset = NB_CHANNELS * iPos;
+                    const size_t iPos = static_cast<size_t>(ix)
+                        + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y));
+                    int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
 
-                    const int wOffset = (output*KERNEL_HEIGHT + sy) 
+                    // Wrapping cannot occur in the middle of a line, except if
+                    // there is only one line (1D)!
+                    bool wrapInRange = false;
+
+                    if (INPUT_MEM_WRAP_SIZE > 0
+                        && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                    }
+                    else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
+                        && CHANNELS_HEIGHT == 1 // single line (1D)!
+                        && iOffset + KERNEL_WIDTH * NB_CHANNELS
+                            > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        wrapInRange = true;
+                    }
+
+                    const size_t wOffset = (output*KERNEL_HEIGHT + sy) 
                                         * KERNEL_WIDTH;
 
-                    if (DILATION_X == 1 && ((PADDING_X == 0
+                    if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
+                        && DILATION_X == 1 && ((PADDING_X == 0
                             && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                         || sxMax - sxMin == KERNEL_WIDTH))
                     {
@@ -97,7 +135,7 @@ void convolution_depthwise_forward(
                             weightedSum);
                     }
                     else {
-                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                        for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
                             if ((PADDING_X != 0
                                     || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                 && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
@@ -105,8 +143,16 @@ void convolution_depthwise_forward(
                                 continue;
                             }
 
-                            const int iOffsetInRange = iOffset
-                                + sx * DILATION_X * NB_CHANNELS;
+                            int iOffsetInRange = iOffset
+                                + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                            if (wrapInRange
+                                && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                            {
+                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                            - INPUT_MEM_CONT_OFFSET
+                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                            }
 
                             weightedSum += inputs[iOffsetInRange + channel]
                                 * weights[wOffset + sx];
@@ -130,6 +176,18 @@ template<int NB_CHANNELS,
          int DILATION_Y, int DILATION_X,
          int KERNEL_HEIGHT, int KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         int INPUT_MEM_CONT_OFFSET,
+         int INPUT_MEM_CONT_SIZE,
+         int INPUT_MEM_WRAP_OFFSET,
+         int INPUT_MEM_WRAP_SIZE,
+         int INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         int OUTPUT_MEM_CONT_OFFSET,
+         int OUTPUT_MEM_CONT_SIZE,
+         int OUTPUT_MEM_WRAP_OFFSET,
+         int OUTPUT_MEM_WRAP_SIZE,
+         int OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T,
          typename Weight_T,
          typename Rescaling_T>
@@ -157,7 +215,19 @@ void convolution_depthwise_forward(
                         DILATION_X,
                         KERNEL_HEIGHT,
                         KERNEL_WIDTH,
-                        ACTIVATION>
+                        ACTIVATION,
+                        // Memory mapping: inputs
+                        INPUT_MEM_CONT_OFFSET,
+                        INPUT_MEM_CONT_SIZE,
+                        INPUT_MEM_WRAP_OFFSET,
+                        INPUT_MEM_WRAP_SIZE,
+                        INPUT_MEM_STRIDE,
+                        // Memory mapping: outputs
+                        OUTPUT_MEM_CONT_OFFSET,
+                        OUTPUT_MEM_CONT_SIZE,
+                        OUTPUT_MEM_WRAP_OFFSET,
+                        OUTPUT_MEM_WRAP_SIZE,
+                        OUTPUT_MEM_STRIDE>
                         (inputs, outputs, weights, b, rescaling);
 }
 
diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp
index 9468b33f6b9785f36f511b14daffe9cc4a0ed420..fed13f956f7a464ba0ca7cf3e92642e6c28cd9db 100644
--- a/aidge_export_cpp/kernels/elemwise.hpp
+++ b/aidge_export_cpp/kernels/elemwise.hpp
@@ -3,169 +3,73 @@
 
 #include "network/typedefs.hpp"
 #include "network/activation_utils.hpp"
-
-// Generic function for two inputs
-
-template<int NB_ELTS,
-         ElemWise_T ELEM_OP,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+#include <sys/types.h>
+
+template<size_t NB_MAT, ElemWise_T ELEM_OP,
+         size_t INPUT1_CONT_SIZE, size_t INPUT2_CONT_SIZE, size_t OUTPUT_CONT_SIZE,
+         const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
+         ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT1_MEM_CONT_OFFSET,
+         size_t INPUT1_MEM_CONT_SIZE,
+         size_t INPUT1_MEM_WRAP_OFFSET,
+         size_t INPUT1_MEM_WRAP_SIZE,
+         size_t INPUT1_MEM_STRIDE,
+         size_t INPUT2_MEM_CONT_OFFSET,
+         size_t INPUT2_MEM_CONT_SIZE,
+         size_t INPUT2_MEM_WRAP_OFFSET,
+         size_t INPUT2_MEM_WRAP_SIZE,
+         size_t INPUT2_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
+         typename Input_T, typename Output_T, typename Rescaling_T>
 __attribute__((always_inline)) inline
-void elemwise_forward (
+void elemwise_forward(
     Output_T* __restrict outputs,
     const Rescaling_T& __restrict rescaling,
     const Input_T* __restrict inputs1,
     const Input_T* __restrict inputs2)
 {
-    if (std::is_floating_point<Input_T>::value)
-    {
-        Input_T val = 0;
-
-        switch (ELEM_OP) {
-            case Add: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] + inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-            case Sub: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] - inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-
-                }
-                break;
-            }
-            case Mul: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] * inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-            default: {
-                // Copy inputs1 in outputs for default case
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-        }
-    }
-    else
-    {
-        int32_t val = 0;
+    static_assert(INPUT1_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
+    static_assert(INPUT2_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
+    static_assert(OUTPUT_MEM_CONT_SIZE % OUTPUT_CONT_SIZE == 0, "Incompatible output memory wrapping");
 
+    auto apply_op = [](auto a, auto b) -> Output_T {
         switch (ELEM_OP) {
-            case Add: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] + inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-            case Sub: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] - inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-            case Mul: {
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i] * inputs2[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
-            default: {
-                // Copy inputs1 in outputs for default case
-                for (int i = 0; i < NB_ELTS; ++i) {
-                    val = inputs1[i];
-                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
-                }
-                break;
-            }
+            case Add: return a + b;
+            case Sub: return a - b;
+            case Mul: return a * b;
+            case Div: return a / b;
+            default:  return a;
         }
-    }
-}
-
+    };
 
-// Generic function for multiple inputs
-// Not working
-
-// template<ElemWise_T ELEM_OP, typename Output_T>
-// __attribute__((always_inline)) inline
-// Output_T elemWise (int /*pos*/, int /*ch*/)
-// {
-//     return 0;
-// }
-
-// template<ElemWise_T ELEM_OP,
-//          int NB_CHANNELS,
-//          // For next inputs
-//          int... ARGS,
-//          typename... INPUTS,
-//          // Types
-//          typename Input_T, typename Output_T>
-// __attribute__((always_inline)) inline
-// Output_T elemWise (int pos, int ch,
-//                    const Input_T* __restrict firstInputs,
-//                    INPUTS... inputs)
-// {
-//     int iOffset = NB_CHANNELS * pos;
-
-//     return firstInputs[iOffset + ch]
-//                 + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
-// }
-
-// template<// For all inputs
-//          int NB_CHANNELS,
-//          int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-//          int NB_ELTS,
-//          int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-//          ElemWise_T ELEM_OP,
-//          ActivationFunction_T ACTIVATION,
-//          // For next inputs
-//          int... ARGS,
-//          typename... INPUTS,
-//          // Types
-//          typename Input_T, typename Output_T,
-//          typename Rescaling_T>
-// __attribute__((always_inline)) inline
-// void elemWise_forward (
-//     Output_T* __restrict outputs,
-//     const Rescaling_T& __restrict rescaling,
-//     const Input_T* __restrict firstInputs,
-//     INPUTS... inputs)
-// {
-//     for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
-//         for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
-//             const int pos = (ox + OUTPUTS_WIDTH * oy);
-//             int oOffset = NB_ELTS * pos;
-
-//             for (int ch = 0; ch < NB_ELTS; ++ch) {
-//                 const Add_T val = elemWise<ELEM_OP,
-//                                         INPUT_NB_CHANNELS,
-//                                         INPUT_MEM_CONT_OFFSET,
-//                                         INPUT_MEM_CONT_NB_ELTS,
-//                                         INPUT_MEM_WRAP_OFFSET,
-//                                         INPUT_MEM_WRAP_NB_ELTS,
-//                                         INPUT_MEM_STRIDE,
-//                                         ARGS...>(pos, ch, firstInputs, inputs...);
-
-//                 outputs[oOffset + ch]
-//                     = sat<Output_T>(val, ch, ACTIVATION, rescaling);
-//             }
-//         }
-//     }
-// }
+    for (size_t stack = 0; stack < NB_MAT; ++stack) {
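+        // OFFSET_IN1/OFFSET_IN2 hold one entry per stacked output matrix and
+        // implement broadcasting: a broadcast input repeats the same block
+        // offset across several stacks.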
+        const size_t offset_in1 = OFFSET_IN1[stack] * INPUT1_CONT_SIZE;
+        const size_t offset_in2 = OFFSET_IN2[stack] * INPUT2_CONT_SIZE;
+        int out_offset = stack * OUTPUT_CONT_SIZE;
 
+        if (OUTPUT_MEM_WRAP_SIZE > 0 && out_offset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+            out_offset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                        - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+        }
 
+        for (size_t i = 0; i < OUTPUT_CONT_SIZE; ++i) {
+            const size_t in1_id = (INPUT1_CONT_SIZE != 1) ? i : 0;
+            const size_t in2_id = (INPUT2_CONT_SIZE != 1) ? i : 0;
+            const size_t out_id = out_offset + i;
 
+            const auto val1 = inputs1[in1_id + offset_in1];
+            const auto val2 = inputs2[in2_id + offset_in2];
+            const Output_T val = apply_op(val1, val2);
 
+            outputs[out_id] = activation_forward_value<Output_T>(val, out_id, ACTIVATION, rescaling);
+        }
+    }
+}
 
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..371b959b011fe2f80fe8ac5e8f9284433885d95d
--- /dev/null
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -0,0 +1,40 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
+#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
+
+#include "network/typedefs.hpp"
+#include "math.h"
+#include <sys/types.h>
+
+template<size_t NB_ELTS,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void erf_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
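+    // Coefficients of the Abramowitz & Stegun 7.1.26 rational approximation
+    // of erf(x); its maximum absolute error is about 1.5e-7.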
+    constexpr double a1 =  0.254829592;
+    constexpr double a2 = -0.284496736;
+    constexpr double a3 =  1.421413741;
+    constexpr double a4 = -1.453152027;
+    constexpr double a5 =  1.061405429;
+    constexpr double p  =  0.3275911;
+
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        int sign = 1;
+        if (inputs[i] < 0)
+            sign = -1;
+        const double abs_value = std::fabs(static_cast<double>(inputs[i]));
+
+        // A&S formula 7.1.26
+        const double t = 1.0/(1.0 + p*abs_value);
+        const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*std::exp(-abs_value*abs_value);
+        outputs[i] = sign*y;
+
+    }
+}
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ERF__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index abaab59c355263a79c905ffeb8a2a72b6e976445..5d9aaf5a91d4a2ba6176df6da59b4bd7a7122d1e 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -6,12 +6,29 @@
 #include "network/utils.hpp"
 #include "network/macs.hpp"
 #include "network/activation_utils.hpp"
+#include <sys/types.h>
 
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+/**
+ * @brief Kernel to use when the input is in the NHWC format, and the
+ * weights have been transposed accordingly.
+ */
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T,
          typename Weight_T, typename Bias_T,
          typename Rescaling_T>
@@ -23,55 +40,209 @@ void fullyconnected_forward (
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    // Warning, there is a trick here !
-    // To use this kernel, the inputs have to be in NHWC and the weights are in NCHW
-    // It is only an issue if the FC was after a flatten layer.
-    // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
-    // Solution: Add a system to check dataformat
+    constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
+    constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
+    // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
+
+    constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS;
+    constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH;
+    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-    for (int och = 0; och < NB_OUTPUTS; och++) {
-
+    for (size_t och = 0; och < NB_OUTPUTS; ++och) {
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
 
-        for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
-            for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
-                for (int ch = 0; ch < NB_CHANNELS; ++ch) {
-                    weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
-                                * weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
+        for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+            int iOffset = INPUT_HEIGHT_STRIDE * iy;
+
+            // Wrapping cannot occur in the middle of a line, except if
+            // there is only one line (1D)!
+            bool wrapInRange = false;
+
+            if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+                iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+            }
+            else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
+                && CHANNELS_HEIGHT == 1 // single line (1D)!
+                && iOffset + CHANNELS_WIDTH * NB_CHANNELS
+                    > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+            {
+                wrapInRange = true;
+            }
+
+            const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och;
+
+            if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) {
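+                // Fast path: the input line is packed exactly like the
+                // weight line (stride == NB_CHANNELS), so the whole line is
+                // reduced in a single MAC loop.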
+                macsOnRange<INPUT_HEIGHT_STRIDE>(
+                    inputs + iOffset,
+                    weights + wOffset,
+                    weightedSum);
+            }
+            else {
+                for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                    int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
+
+                    if (wrapInRange
+                        && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                    - INPUT_MEM_CONT_OFFSET
+                                    - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                    }
+
+                    macsOnRange<INPUT_WIDTH_STRIDE>(
+                        inputs + iOffsetInRange,
+                        weights + wOffset + ix * WEIGHT_WIDTH_STRIDE,
+                        weightedSum);
                 }
             }
         }
 
         outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
     }
-/*
-Here the kernel to use with inputs in NHWC and weights in NHWC
+}
+
+/**
+ * @brief Kernel to use when the input is in the NCHW or default
+ * format (4D or 2D).
+ */
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
+         typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void fullyconnected_default_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
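+    // With NCHW or flat (2D) inputs, the whole input vector is contiguous
+    // and matches the weight row layout, so each output channel needs a
+    // single MAC loop over all input elements.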
+
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-    for (int och = 0; och < NB_OUTPUTS; och++) {
+    for (size_t och = 0; och < NB_OUTPUTS; och++) {
+        Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
+
+        const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och;
+
+        macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>(
+            inputs,
+            weights + wOffset,
+            weightedSum);
 
+        outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
+    }
+}
+
+/**
+ * @brief Kernel to use when the input is in the NHWC format, but the
+ * weights have not been transposed and still follow the NCHW format order.
+ */
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
+         typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void fullyconnected_transpose_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+    constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
+    constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
+    // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
+
+    constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH;
+    constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH;
+    constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t och = 0; och < NB_OUTPUTS; och++) {
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
 
-        for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
-            const int iPos = (CHANNELS_WIDTH * iy);
-            int iOffset = NB_CHANNELS * iPos;
+        for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+            int iOffset = INPUT_HEIGHT_STRIDE * iy;
+
+            // Wrapping cannot occur in the middle of a line, except if
+            // there is only one line (1D)!
+            bool wrapInRange = false;
+
+            if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+                iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+            }
+            else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
+                && CHANNELS_HEIGHT == 1 // single line (1D)!
+                && iOffset + CHANNELS_WIDTH * NB_CHANNELS
+                    > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+            {
+                wrapInRange = true;
+            }
 
-            const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
-                                    * (iy + CHANNELS_HEIGHT * och);
+            const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy;
 
-            macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
-                inputs + iOffset,
-                weights + wOffset,
-                weightedSum);
+            for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
+
+                if (wrapInRange
+                    && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                {
+                    iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                - INPUT_MEM_CONT_OFFSET
+                                - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                }
+
+                // Beware that the pointer increment for weights is
+                // CHANNELS_HEIGHT*CHANNELS_WIDTH
+                macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>(
+                    inputs + iOffsetInRange,
+                    weights + wOffset + ix,
+                    weightedSum);
+            }
         }
 
         outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
     }
-*/
 }
 
-
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
diff --git a/aidge_export_cpp/kernels/hardmax.hpp b/aidge_export_cpp/kernels/hardmax.hpp
index 0f183b394e0aca1fd5502573ab627ea23d067169..0f15936d255be014003ae2550d2ea1e19849eabb 100644
--- a/aidge_export_cpp/kernels/hardmax.hpp
+++ b/aidge_export_cpp/kernels/hardmax.hpp
@@ -3,15 +3,16 @@
 
 #include "network/typedefs.hpp"
 #include "network/utils.hpp"
+#include <sys/types.h>
 
 // Todo add border value and border type (Reflect, Constant, Wrap...) and add
 // the two missing pad value (bottom and right)
 
-template <unsigned int AXIS_DIM_SIZE,
-          unsigned int PREAXIS_STRIDE,
-          unsigned int AXIS_STRIDE,
-          unsigned int POSTAXIS_STRIDE,
-          unsigned int NB_ELTS,
+template <size_t AXIS_DIM_SIZE,
+          size_t PREAXIS_STRIDE,
+          size_t AXIS_STRIDE,
+          size_t POSTAXIS_STRIDE,
+          size_t NB_ELTS,
           typename Input_T,
           typename Output_T>
 // void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
@@ -26,17 +27,17 @@ hardmax2d_forward(const Input_T *__restrict input,
 
     // For each index on all the axes before and after 'axis', we have a
     // different max element to find
-    for (unsigned int i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
+    for (size_t i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
          ++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
 
-        for (unsigned int j = 0; j < POSTAXIS_STRIDE; ++j) {
+        for (size_t j = 0; j < POSTAXIS_STRIDE; ++j) {
             // Init the max with first element
-            unsigned int maxIdx = 0;
+            size_t maxIdx = 0;
             Input_T maxVal = input[preAxisOffset + j];
             // Loop over the elements on 'axis'
             // Since we start at 0th idx, we already initialize the values like
             // the 1st iteration has been done
-            for (unsigned int k = 1,
+            for (size_t k = 1,
                               postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
                  k < AXIS_DIM_SIZE;
                  ++k, postAxisOffset += POSTAXIS_STRIDE) {
diff --git a/aidge_export_cpp/kernels/reshape.hpp b/aidge_export_cpp/kernels/identity.hpp
similarity index 56%
rename from aidge_export_cpp/kernels/reshape.hpp
rename to aidge_export_cpp/kernels/identity.hpp
index 376ed8859241947c8239d086df75838b4865505e..7f7b31d80b2ab5eba9a41ce63d35c88cc8255c72 100644
--- a/aidge_export_cpp/kernels/reshape.hpp
+++ b/aidge_export_cpp/kernels/identity.hpp
@@ -1,27 +1,27 @@
-#ifndef __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__
-#define __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
+#define __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
 
 #include "network/typedefs.hpp"
+#include <sys/types.h>
 
-// Generic function for reshape and activation
+// Generic function for identity and activation
 
-template<int M,
+template<size_t M,
          typename Input_T, typename Output_T>
 __attribute__((always_inline)) inline
-void reshape_forward (
+void identity_forward (
     const Input_T* __restrict inputs,
-    const Input_T* __restrict /*shape*/,
     Output_T* __restrict outputs)
 {
     // If inputs and outputs pointers are the same, the memory manager has already optimized this function so it is a no-op !
     if (inputs == outputs)
         return;
 
-    // A reshape in c++ world should equal to a Noop
+    // An identity in the C++ world is equivalent to a no-op
     // We only need to copy the input buffer to the output
-    for (int m = 0; m < M; ++m) {
+    for (size_t m = 0; m < M; ++m) {
         outputs[m] = inputs[m];
     }
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__
\ No newline at end of file
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/leakyrelu.hpp b/aidge_export_cpp/kernels/leakyrelu.hpp
index 5e6598d8fe5d43d9ae9320498289577ab7695e97..cd18762fab47274eab3b3e095452ac8d414fc827 100644
--- a/aidge_export_cpp/kernels/leakyrelu.hpp
+++ b/aidge_export_cpp/kernels/leakyrelu.hpp
@@ -2,26 +2,50 @@
 #define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
 
 #include "network/typedefs.hpp"
+#include <sys/types.h>
 
-template<int NB_DATA,
+template<size_t NB_ELTS,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T>
-__attribute__((always_inline)) inline 
+__attribute__((always_inline)) inline
 void leakyrelu_forward (
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs,
     const float negative_slope)
 {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int i = 0; i < NB_DATA; ++i) {
-        if (inputs[i] >= 0) {
-            outputs[i] = inputs[i];
-        } else {
-            outputs[i] = negative_slope * inputs[i];
+    int inOffset = 0;
+    int outOffset = 0;
+
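+    // Inputs/outputs may live in circular buffers: once index i crosses the
+    // end of the contiguous region, a one-time jump to the wrap region is
+    // added (MEM_* values are byte counts, hence the division by sizeof).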
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        if (INPUT_MEM_WRAP_SIZE > 0 && i == INPUT_MEM_CONT_SIZE / sizeof(Input_T)) {
+            inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+        }
+
+        if (OUTPUT_MEM_WRAP_SIZE > 0 && i == OUTPUT_MEM_CONT_SIZE / sizeof(Output_T)) {
+            outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                        - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+        }
+
+        if (inputs[inOffset + i] >= 0) {
+            outputs[outOffset + i] = inputs[inOffset + i];
+        }
+        else {
+            outputs[outOffset + i] = negative_slope * inputs[inOffset + i];
         }
     }
 }
 
-
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
\ No newline at end of file
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp
index b507c4f1e37065a620a0ac37ed370cfa6847487d..7494519e73cccc5fc0647e02065e3a1685979c7b 100644
--- a/aidge_export_cpp/kernels/matmul.hpp
+++ b/aidge_export_cpp/kernels/matmul.hpp
@@ -3,15 +3,14 @@
 
 #include "network/typedefs.hpp"
 #include "network/activation_utils.hpp"
+#include <sys/types.h>
 
 // Generic function for matmul and activation
 
-template<int M,
-         int K,
-         int N,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+template<size_t NB_MAT, size_t N, size_t M, size_t K,
+         const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T, typename Rescaling_T>
 __attribute__((always_inline)) inline
 void matmul_forward (
     const Input_T* __restrict inputs1,
@@ -19,15 +18,28 @@ void matmul_forward (
     Output_T* __restrict outputs,
     const Rescaling_T& __restrict rescaling)
 {
-    for (int m = 0; m < M; ++m) {
-        for (int n = 0; n < N; ++n) {
-            Output_T sum = Output_T(0);
-            for (int k = 0; k < K; ++k) {
-                sum += inputs1[K*m + k] * inputs2[N*k + n];
+    for (size_t stack = 0; stack < NB_MAT; ++stack) {
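+        // OFFSET_IN1/OFFSET_IN2 map each stacked output matrix to its input
+        // matrices; broadcasting repeats the same offset across stacks.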
+        const size_t offset1 = OFFSET_IN1[stack] * N * K;
+        const size_t offset2 = OFFSET_IN2[stack] * K * M;
+        Output_T* out_ptr = &outputs[stack * N * M];
+
+        for (size_t i = 0; i < N; ++i) {
+            const Input_T* in1_row = &inputs1[offset1 + i * K];
+
+            for (size_t j = 0; j < M; ++j) {
+                Output_T sum = 0;
+
+                // Access column of inputs2 as row-major
+                for (size_t l = 0; l < K; ++l) {
+                    sum += in1_row[l] * inputs2[offset2 + l * M + j];
+                }
+
+                out_ptr[i * M + j] = activation_forward_value<Output_T>(
+                    sum, 0 /* not applicable */, ACTIVATION, rescaling
+                );
             }
-            outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
         }
     }
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/pad.hpp b/aidge_export_cpp/kernels/pad.hpp
index 4e83257c1152b1963dd4b0eefc912216a729de7d..3f382c466de7b4549bf45b34398a7a893c8c5fc8 100644
--- a/aidge_export_cpp/kernels/pad.hpp
+++ b/aidge_export_cpp/kernels/pad.hpp
@@ -3,44 +3,83 @@
 
 #include "network/typedefs.hpp"
 #include "network/utils.hpp"
+#include <sys/types.h>
 
-// Todo add border value and border type (Reflect, Constant, Wrap...) and add the two missing pad value (bottom and right)
-
-template<int NB_BATCHES, int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_TOP,
-         int PADDING_LEFT,
-         int PADDING_BOTTOM,
-         int PADDING_RIGHT,
-         typename Input_T, typename Output_T>
-__attribute__((always_inline)) inline
-void pad_forward(
-    double borderValue,
-    const Input_T* __restrict inputs,
-    Output_T* __restrict outputs
-    )
-{
-    const unsigned int oySize = CHANNELS_HEIGHT + PADDING_TOP + PADDING_BOTTOM;
-    const unsigned int oxSize = CHANNELS_WIDTH + PADDING_LEFT + PADDING_RIGHT;
-
-    for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
-        for (unsigned int ch = 0; ch < NB_CHANNELS; ++ch) {
-            const unsigned int preIndex = batch * NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH + ch * CHANNELS_HEIGHT * CHANNELS_WIDTH;
-
-            for (unsigned int oy = 0; oy < oySize; ++oy) {
-                for (unsigned int ox = 0; ox < oxSize; ++ox) {
-                    const unsigned int outIndex = batch * NB_CHANNELS * oySize * oxSize + ch * oySize * oxSize + oy * oxSize + ox;
-
-                    outputs[outIndex] = borderValue;
-
-                    const unsigned int inputX = ox - PADDING_LEFT;
-                    const unsigned int inputY = oy - PADDING_TOP;
-
-                    if (inputY >= 0 and inputY < CHANNELS_HEIGHT and inputX >= 0 and inputX < CHANNELS_WIDTH)
-                    {
-                        outputs[outIndex] = inputs[preIndex + inputY * CHANNELS_WIDTH + inputX];
+// TODO: add border value and border type (Reflect, Constant, Wrap...) and add
+// the two missing pad values (bottom and right)
+
+template <size_t NB_BATCHES,
+          size_t NB_CHANNELS,
+          size_t CHANNELS_HEIGHT,
+          size_t CHANNELS_WIDTH,
+          size_t NB_OUTPUTS,
+          size_t OUTPUTS_HEIGHT,
+          size_t OUTPUTS_WIDTH,
+          int PADDING_TOP,
+          int PADDING_LEFT,
+          int PADDING_BOTTOM,
+          int PADDING_RIGHT,
+          typename Input_T,
+          typename Output_T>
+__attribute__((always_inline)) inline void
+pad_forward(double borderValue,
+            const Input_T *__restrict inputs,
+            Output_T *__restrict outputs) {
+    constexpr size_t oySize =
+        CHANNELS_HEIGHT + PADDING_TOP + PADDING_BOTTOM;
+    constexpr size_t oxSize =
+        CHANNELS_WIDTH + PADDING_LEFT + PADDING_RIGHT;
+
+    constexpr size_t inputStrides[3] = {
+        NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH,
+        CHANNELS_WIDTH * CHANNELS_HEIGHT,
+        CHANNELS_WIDTH};
+    constexpr size_t outputStrides[3] = {
+        NB_CHANNELS * oySize * oxSize,
+        oySize * oxSize,
+        oxSize,
+    };
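+    // Stride index meaning: [0] = batch, [1] = channel, [2] = row.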
+
+    for (size_t batch = 0, inBatchOffset = 0, outBatchOffset = 0;
+         batch < NB_BATCHES;
+         ++batch,
+                      inBatchOffset += inputStrides[0],
+                      outBatchOffset += outputStrides[0]) {
+
+        for (size_t ch = 0,
+                          inChannelOffset = inBatchOffset,
+                          outChannelOffset = outBatchOffset;
+             ch < NB_CHANNELS;
+             ++ch,
+                          inChannelOffset += inputStrides[1],
+                          outChannelOffset += outputStrides[1]) {
+
+            for (int oY = 0,
+                     oDimYOffset = outChannelOffset,
+                     iY = oY - PADDING_TOP,
+                     // iDimYOffset won't be used unless iY >= 0, hence no
+                     // risk of a negative index
+                 iDimYOffset = inChannelOffset + iY * inputStrides[2];
+                 static_cast<size_t>(oY) < oySize;
+                 ++oY,
+                     ++iY,
+                     iDimYOffset += inputStrides[2],
+                     oDimYOffset += outputStrides[2]) {
+
+                if (iY < 0 or iY >= static_cast<int>(CHANNELS_HEIGHT)) {
+                    for (Output_T *o = outputs + oDimYOffset;
+                         o != outputs + oDimYOffset + outputStrides[2];
+                         ++o) {
+                        *o = borderValue;
+                    }
+                    continue;
+                }
+                for (size_t oX = 0; oX < oxSize; ++oX) {
+                    const int iX = static_cast<int>(oX) - PADDING_LEFT;
+                    if (iX < 0 or iX >= static_cast<int>(CHANNELS_WIDTH)) {
+                        outputs[oDimYOffset + oX] = borderValue;
+                    } else {
+                        outputs[oDimYOffset + oX] = inputs[iDimYOffset + iX];
                     }
                 }
             }
@@ -48,4 +87,4 @@ void pad_forward(
     }
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_PAD2D__
+#endif // __AIDGE_EXPORT_CPP_KERNELS_PAD2D__
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index 12ac69ffcf30e72c6d854753d4d2a22b1ce4419c..7f2d3a392c33f132727ce4638a45c09c9d48761e 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -6,60 +6,77 @@
 #include <limits>
 #include <cmath>
 #include <stdexcept>
-
-
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int POOL_HEIGHT, int POOL_WIDTH,
+#include <sys/types.h>
+
+template<size_t NB_CHANNELS,
+         size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
+         size_t NB_OUTPUTS,
+         size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
+         size_t PADDING_Y, size_t PADDING_X,
+         size_t STRIDE_Y, size_t STRIDE_X,
+         size_t POOL_HEIGHT, size_t POOL_WIDTH,
          Pooling_T POOLING_TYPE,
          ActivationFunction_T ACTIVATION,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
          typename Input_T, typename Output_T>
 __attribute__((always_inline)) inline
 void pooling_forward(
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs)
 {
-    constexpr int OUTPUTS_HEIGHT_NOPAD
+    constexpr size_t OUTPUTS_HEIGHT_NOPAD
         = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
+    constexpr size_t OUTPUTS_WIDTH_NOPAD
         = (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X;
 
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
+    for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const size_t syMin = (PADDING_Y == 0) ? 0
-            : max(PADDING_Y - (oy * STRIDE_Y), 0);
+            : max(static_cast<int>(PADDING_Y) - static_cast<int>(oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
+        const size_t syMax = (PADDING_Y == 0
                 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? POOL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
-                    0, POOL_HEIGHT);
+            : clamp(static_cast<int>(CHANNELS_HEIGHT + PADDING_Y) - static_cast<int>(oy * STRIDE_Y),
+                    0, static_cast<int>(POOL_HEIGHT));
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
+        const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
 
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
+        for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            for (size_t output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
+                const size_t sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
+                    : max(static_cast<int>(PADDING_X) - static_cast<int>(ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
+                const size_t sxMax = (PADDING_X == 0
                         && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                             ? POOL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
-                            0, POOL_WIDTH);
+                    : clamp(static_cast<int>(CHANNELS_WIDTH + PADDING_X) - static_cast<int>(ox * STRIDE_X),
+                            0, static_cast<int>(POOL_WIDTH));
-                const int ix = (ox * STRIDE_X) - PADDING_X;
+                const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
+
+                const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
+                int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
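+                // OUTPUT_MEM_* values are expressed in bytes; dividing by
+                // sizeof(Output_T) converts them to element offsets.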
 
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                int oOffset = NB_OUTPUTS * oPos;
+                if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+                    oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                                - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+                }
                 // <--
 
                 if (POOLING_TYPE == Max) {
                     Input_T maxVal = std::numeric_limits<Input_T>::lowest();
 
-                    for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
+                    for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) {
                         if ((PADDING_Y != 0
                                 || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                             && sy >= syMax - syMin)
@@ -67,11 +84,29 @@ void pooling_forward(
                             break;
                         }
 
-                        const int iPos = ((sxMin + ix)
-                                            + CHANNELS_WIDTH * (iy + syMin + sy));
-                        int iOffset = NB_CHANNELS * iPos;
+                        const size_t iPos = static_cast<size_t>(sxMin + ix)
+                                            + CHANNELS_WIDTH * (static_cast<size_t>(iy + syMin + sy));
+                        int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
+
+                        // Wrapping cannot occur in the middle of a line, except if
+                        // there is only one line (1D)!
+                        bool wrapInRange = false;
+
+                        if (INPUT_MEM_WRAP_SIZE > 0
+                            && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+                        else if (INPUT_MEM_WRAP_SIZE > 0 && POOL_WIDTH > 1
+                            && CHANNELS_HEIGHT == 1 // single line (1D)!
+                            && iOffset + POOL_WIDTH * (INPUT_MEM_STRIDE / sizeof(Input_T))
+                                > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            wrapInRange = true;
+                        }
 
-                        for (int sx = 0; sx < POOL_WIDTH; ++sx) {
+                        for (size_t sx = 0; sx < POOL_WIDTH; ++sx) {
                             if ((PADDING_X != 0
                                     || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                 && sx >= sxMax - sxMin)
@@ -79,7 +114,16 @@ void pooling_forward(
                                 break;
                             }
 
-                            int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
+                            int iOffsetInRange = iOffset + output
+                                + sx * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                            if (wrapInRange &&
+                                iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                            {
+                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                            - INPUT_MEM_CONT_OFFSET
+                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                            }
 
                             if (inputs[iOffsetInRange] > maxVal)
                                 maxVal = inputs[iOffsetInRange];
@@ -91,7 +135,7 @@ void pooling_forward(
                 else if (POOLING_TYPE == Average) {
                     float sum = 0;
 
-                    for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
+                    for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) {
                         if ((PADDING_Y != 0
                                 || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                             && sy >= syMax - syMin)
@@ -99,11 +143,29 @@ void pooling_forward(
                             break;
                         }
 
-                        const int iPos = ((sxMin + ix)
-                                            + CHANNELS_WIDTH * (iy + syMin + sy));
-                        int iOffset = NB_CHANNELS * iPos;
+                        const size_t iPos = static_cast<size_t>(sxMin + ix)
+                                            + CHANNELS_WIDTH * (static_cast<size_t>(iy + syMin + sy));
+                        int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
+
+                        // Wrapping cannot occur in the middle of a line, except if
+                        // there is only one line (1D)!
+                        bool wrapInRange = false;
 
-                        for (int sx = 0; sx < POOL_WIDTH; ++sx) {
+                        if (INPUT_MEM_WRAP_SIZE > 0
+                            && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+                        else if (INPUT_MEM_WRAP_SIZE > 0 && POOL_WIDTH > 1
+                            && CHANNELS_HEIGHT == 1 // single line (1D)!
+                            && iOffset + POOL_WIDTH * (INPUT_MEM_STRIDE / sizeof(Input_T))
+                                > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            wrapInRange = true;
+                        }
+
+                        for (size_t sx = 0; sx < POOL_WIDTH; ++sx) {
                             if ((PADDING_X != 0
                                     || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                 && sx >= sxMax - sxMin)
@@ -111,7 +173,17 @@ void pooling_forward(
                                 break;
                             }
 
-                            int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
+                            int iOffsetInRange = iOffset + output
+                                + sx * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                            if (wrapInRange &&
+                                iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                            {
+                                iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                            - INPUT_MEM_CONT_OFFSET
+                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                            }
+
                             sum += inputs[iOffsetInRange];
                         }
                     }
diff --git a/aidge_export_cpp/kernels/reducemean.hpp b/aidge_export_cpp/kernels/reducemean.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4a693d084af4941dd1d95c74eda84d720388922
--- /dev/null
+++ b/aidge_export_cpp/kernels/reducemean.hpp
@@ -0,0 +1,175 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__
+#define __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__
+
+#include "network/typedefs.hpp"
+#include "network/utils.hpp"
+#include <cmath>
+#include <type_traits>
+#include <sys/types.h>
+
+template <typename T>
+using Acc_T =
+    typename std::conditional_t<std::is_floating_point<T>::value, T, double>;
+
+// Computes the mean iteratively: m_{i+1} = m_i + (x_{i+1} - m_i) / (i + 1),
+// evaluated with std::fma for better accuracy.
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+stableMean(const T *vec, std::size_t len, std::size_t stride) {
+    T mean = 0;
+    for (std::size_t i = 0; i < len; ++i) {
+        mean = std::fma(vec[i * stride] - mean,
+                        static_cast<T>(1) / static_cast<T>(i + 1),
+                        mean);
+    }
+    return mean;
+}
+
+// Specialization for integers: perform the mean computation in float
+template <typename T>
+typename std::enable_if_t<!std::is_floating_point<T>::value, double>
+stableMean(const T *vec, std::size_t len, std::size_t stride) {
+    double mean = 0;
+    for (size_t i = 0; i < len; ++i) {
+        mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean,
+                                1.0 / static_cast<double>(i + 1),
+                                mean);
+    }
+    return mean;
+}
+
+template <typename T>
+typename std::enable_if_t<std::is_floating_point<T>::value, T>
+castFromFloat(T value) {
+    return value;
+}
+
+template <typename T>
+typename std::enable_if_t<!std::is_floating_point<T>::value, T>
+castFromFloat(double value) {
+    return static_cast<T>(std::nearbyint(value));
+}
+
+/**
+ * @brief Computes the mean of the tensor values over the specified axis.
+ * This function can be called in a for loop to reduce along several axes
+ * iteratively.
+ * @param[in] preAxisNbElts : number of elements on the axes before the axis
+ * to reduce.
+ * @param[in] postAxisNbElts : number of elements on the axes after the axis
+ * to reduce.
+ * @param[in] axisNbElts : number of elements on the axis to reduce.
+ * @param[in] iDim : input dimension of the tensor along the axis to reduce.
+ * @param[in] prevAcc : values returned by the previous iteration; on the
+ * 1st iteration, it is the input tensor.
+ * @param[out] currAcc : output of the computation, i.e. the tensor with
+ * values averaged along the given axis.
+ */
+template <typename Input_T, typename Output_T>
+Output_T *computeMeanOverAxis(const size_t preAxisNbElts,
+                              const size_t postAxisNbElts,
+                              const size_t axisNbElts,
+                              const size_t iDim,
+                              const Input_T *__restrict__ prevAcc,
+                              Output_T *currAcc) {
+    for (size_t preAxisIdx = 0, iPreAxisOffset = 0, oPreAxisOffset = 0;
+         preAxisIdx < preAxisNbElts;
+         ++preAxisIdx,
+                     iPreAxisOffset += axisNbElts,
+                     oPreAxisOffset += postAxisNbElts) {
+
+        for (size_t postAxisIdx = 0; postAxisIdx < postAxisNbElts;
+             ++postAxisIdx) {
+            currAcc[oPreAxisOffset + postAxisIdx] = castFromFloat<Output_T>(
+                stableMean(prevAcc + iPreAxisOffset + postAxisIdx,
+                           iDim,
+                           postAxisNbElts));
+        }
+    }
+    return currAcc;
+}
+
+template <size_t IN_NB_DIMS,
+          size_t IN_NB_ELTS,
+          size_t OUT_NB_ELTS,
+          size_t NB_AXES_TO_REDUCE,
+          typename Input_T,
+          typename Output_T>
+__attribute__((always_inline)) inline void
+reducemean_forward(const size_t axesToReduce[NB_AXES_TO_REDUCE],
+                   const size_t iDims[IN_NB_DIMS],
+                   const size_t preAxisStrides[IN_NB_DIMS],
+                   const size_t postAxisStrides[IN_NB_DIMS],
+                   const Input_T *__restrict input,
+                   Output_T *__restrict output) {
+
+    switch (NB_AXES_TO_REDUCE) {
+    case 0: {
+        copy_n(input, IN_NB_ELTS, output);
+        break;
+    }
+    case 1: {
+        output = computeMeanOverAxis<Input_T, Output_T>(
+            preAxisStrides[axesToReduce[0]],
+            postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]] * postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]],
+            input,
+            output);
+        break;
+    }
+    default: {
+
+        // The setup before the loop below is basically just unrolling the
+        // 1st iteration.
+        size_t outputElements = IN_NB_ELTS / iDims[axesToReduce[0]];
+        Acc_T<Input_T> *currAcc = new Acc_T<Input_T>[outputElements];
+        Acc_T<Input_T> *prevAcc = nullptr;
+        prevAcc = computeMeanOverAxis<Input_T, Acc_T<Input_T>>(
+            preAxisStrides[axesToReduce[0]],
+            postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]] * postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]],
+            input,
+            currAcc);
+
+        // mutable copy of preAxisStride to avoid modifying input values
+        size_t preAxisStrides_mut[IN_NB_DIMS];
+        for (size_t i = 0; i < IN_NB_DIMS; ++i) {
+            preAxisStrides_mut[i] =
+                i < axesToReduce[0] + 1
+                    ? preAxisStrides[i]
+                    : preAxisStrides[i] / iDims[axesToReduce[0]];
+        }
+
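+        // prevAcc/currAcc ping-pong between heap buffers: each reduced axis
+        // shrinks the element count, and the buffer from the previous
+        // iteration is freed once it has been consumed.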
+        for (size_t i = 1; i < NB_AXES_TO_REDUCE; ++i) {
+            const size_t axis = axesToReduce[i];
+            outputElements /= iDims[axis];
+            currAcc = new Acc_T<Input_T>[outputElements];
+            currAcc = computeMeanOverAxis<Acc_T<Input_T>, Acc_T<Input_T>>(
+                preAxisStrides_mut[axis],
+                postAxisStrides[axis],
+                iDims[axis] * postAxisStrides[axis],
+                iDims[axis],
+                prevAcc,
+                currAcc);
+
+            for (size_t j = axis + 1; j < IN_NB_DIMS; ++j) {
+                preAxisStrides_mut[j] /= iDims[axis];
+            }
+            delete[] prevAcc;
+            prevAcc = currAcc;
+        }
+
+        for (size_t i = 0; i < OUT_NB_ELTS; ++i) {
+            output[i] = castFromFloat<Output_T>(currAcc[i]);
+        }
+
+        if (currAcc) {
+            delete[] currAcc;
+        }
+    }
+    }
+}
+
+#endif // __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__
diff --git a/aidge_export_cpp/kernels/rescaling.hpp b/aidge_export_cpp/kernels/rescaling.hpp
deleted file mode 100644
index a831fa8730dfa45384c6f251d7fe079caa015ce6..0000000000000000000000000000000000000000
--- a/aidge_export_cpp/kernels/rescaling.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __AIDGE_EXPORT_CPP_NETWORK_RESCALING__
-#define __AIDGE_EXPORT_CPP_NETWORK_RESCALING__
-
-#include "network/rescaling_utils.hpp"
-#include "network/activation_utils.hpp"
-
-template<int NB_DATA,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T,
-         typename Output_T,
-         typename Rescaling_T>
-__attribute__((always_inline)) inline 
-void rescaling_forward (
-    const Input_T* __restrict inputs,
-    Output_T* __restrict outputs,
-    const Rescaling_T& __restrict rescaling)
-{
-#ifdef _OPENMP
-    #pragma omp parallel
-#endif
-    for (int i = 0; i < NB_DATA; ++i) {
-        outputs[i] = activation_forward_value<Output_T>(inputs[i] , 0, ACTIVATION, rescaling);
-    }
-}
-
-#endif  // __AIDGE_EXPORT_CPP_NETWORK_RESCALING__
diff --git a/aidge_export_cpp/kernels/sigmoid.hpp b/aidge_export_cpp/kernels/sigmoid.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3d548d7e8ebd4bdd215e232079aecad4a31c206
--- /dev/null
+++ b/aidge_export_cpp/kernels/sigmoid.hpp
@@ -0,0 +1,75 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__
+#define __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__
+
+#include "network/typedefs.hpp"
+#include <cmath>
+#include <array>
+#include <algorithm>
+#include <limits>
+#include <cstdint>
+#include <sys/types.h>
+
+template <typename Input_T, class Output_T, size_t LUT_IDX, size_t LUT_SIZE>
+constexpr Input_T sigmoid_index()
+{
+    constexpr auto unitVal = std::numeric_limits<typename std::make_unsigned<Output_T>::type>::max();
+    constexpr auto y = 0.5f + 0.5f * static_cast<float>(LUT_IDX) / LUT_SIZE;
+    constexpr auto x = std::log(y) - std::log(1.0f - y);
+    return unitVal * x;
+}
+
+template <typename Input_T, typename Output_T, std::size_t... I>
+constexpr auto sigmoid_lookup_helper(std::index_sequence<I...>)
+{
+    return std::array<Input_T, sizeof...(I)>({sigmoid_index<Input_T, Output_T, I, sizeof...(I)>()...});
+}
+
+template <typename Input_T, typename Output_T, size_t LUT_Size>
+constexpr auto sigmoid_lookup()
+{
+    return sigmoid_lookup_helper<Input_T, Output_T>(std::make_index_sequence<LUT_Size>());
+}
+
+template <typename Output_T, size_t LUT_SIZE>
+constexpr Output_T sigmoid_scale_idx(size_t idx, bool pos) {
+    constexpr auto midVal = (std::numeric_limits<Output_T>::max() + 1) / 2;
+    return (pos) ? midVal + (midVal*idx) / LUT_SIZE - 1 : midVal - (midVal*idx) / LUT_SIZE;
+}
+
+template<size_t NB_ELTS,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+typename std::enable_if<std::is_floating_point<Input_T>::value || std::is_floating_point<Output_T>::value, void>::type
+sigmoid_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        if (inputs[i] > Input_T(0)) {
+            outputs[i] = Output_T(1) / (Output_T(1) + std::exp(-inputs[i]));
+        }
+        else {
+            outputs[i] = std::exp(inputs[i]) / (Output_T(1) + std::exp(inputs[i]));
+        }
+    }
+}
+
+template<size_t NB_ELTS,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+typename std::enable_if<!std::is_floating_point<Input_T>::value && !std::is_floating_point<Output_T>::value, void>::type
+sigmoid_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    constexpr size_t LUT_Size = size_t(1) << (8 * sizeof(Output_T) - 1);
+    static constexpr auto lut = sigmoid_lookup<Input_T, Output_T, LUT_Size>();
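+    // The LUT stores, for each quantized output level y, the smallest input
+    // magnitude x = logit(y), scaled to the input range, that reaches it;
+    // lower_bound below maps |input| to its quantized sigmoid index.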
+
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        const auto it = std::lower_bound(std::begin(lut), std::end(lut), std::abs(inputs[i]));
+        const auto idx = std::distance(std::begin(lut), it);
+        outputs[i] = sigmoid_scale_idx<Output_T, LUT_Size>(idx, inputs[i] > 0);
+    }
+}
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__
diff --git a/aidge_export_cpp/kernels/slice.hpp b/aidge_export_cpp/kernels/slice.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..978ffca5bf549efcac0aa9a3c80570e6fcd830a6
--- /dev/null
+++ b/aidge_export_cpp/kernels/slice.hpp
@@ -0,0 +1,40 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_SLICE__
+#define __AIDGE_EXPORT_CPP_KERNELS_SLICE__
+
+#include "network/typedefs.hpp"
+#include <sys/types.h>
+
+// Generic function for slice
+// Note: the implementation differs from the CPU backend's, but this one needs no additional buffer.
+
+template <typename T,
+         size_t NB_DIMS, size_t NB_ELTS, size_t NB_AXES,
+         const size_t STARTS[], const size_t ENDS[], const size_t STEPS[],
+         const size_t AXES_MOD[], const size_t AXES_DIV[],
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void slice_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    // iterate over each element and check whether it belongs in the slice
+    size_t o = 0;
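+    // For axis i, AXES_MOD[i] is the flat stride spanning that axis and all
+    // inner axes, and AXES_DIV[i] the stride of the inner axes alone, so
+    // (e % AXES_MOD[i]) / AXES_DIV[i] recovers the coordinate along axis i.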
+    for (size_t e = 0; e < NB_ELTS; e++) {
+        bool is_sliced = true;
+        for (size_t i = 0; i < NB_AXES; i++) { // check each sliced axis
+            const size_t ax_idx = (e % AXES_MOD[i]) / AXES_DIV[i];
+            // check the boundaries first, then the step (the bounds must
+            // hold before the unsigned subtraction in the modulo)
+            if ((ax_idx < STARTS[i]) || (ax_idx >= ENDS[i]) || ((ax_idx - STARTS[i]) % STEPS[i] != 0)) {
+                is_sliced = false;
+                break;
+            }
+        }
+        // If the element is in the slice, copy it to output
+        if (is_sliced){
+            outputs[o] = inputs[e];
+            o++;
+        }
+    }
+}
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_SLICE__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/softmax.hpp b/aidge_export_cpp/kernels/softmax.hpp
index d29e9b42cba35287c71d32f211550a51b784aa12..da97f2dae18e1715f800aa737f0738057ee6f20a 100644
--- a/aidge_export_cpp/kernels/softmax.hpp
+++ b/aidge_export_cpp/kernels/softmax.hpp
@@ -7,10 +7,11 @@
 #include <type_traits>
 #include <cmath>
 #include <algorithm>
+#include <sys/types.h>
 
-template<int AXIS_SIZE,
-         int AXIS_SIZE_POST,
-         int AXIS_SIZE_PRE,
+template<size_t AXIS_SIZE,
+         size_t AXIS_SIZE_POST,
+         size_t AXIS_SIZE_PRE,
          typename Input_T, typename Output_T>
 __attribute__((always_inline)) inline
 void softmax_forward (
@@ -20,29 +21,29 @@ void softmax_forward (
     // Iterate over the "pre-axis" and "post-axis" slices.
     // For each slice along the axis, compute the maximum value,
     // the sum of exponentials, and then write the normalized softmax outputs.
-    for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
-        for (int j = 0; j < AXIS_SIZE_POST; ++j) {
+    for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
+        for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
             // Compute the base index for this slice.
-            const int baseIdx = i * AXIS_SIZE * AXIS_SIZE_POST + j;
+            const size_t baseIdx = i * AXIS_SIZE * AXIS_SIZE_POST + j;
 
             // Find the maximum value along the axis.
             Input_T maxVal = inputs[baseIdx];
-            for (int k = 1; k < AXIS_SIZE; ++k) {
-                const int idx = baseIdx + k * AXIS_SIZE_POST;
+            for (size_t k = 1; k < AXIS_SIZE; ++k) {
+                const size_t idx = baseIdx + k * AXIS_SIZE_POST;
                 maxVal = std::max(maxVal, inputs[idx]);
             }
 
             // Compute the sum of the exponentials along the axis.
             Input_T sumExp = 0;
-            for (int k = 0; k < AXIS_SIZE; ++k) {
-                const int idx = baseIdx + k * AXIS_SIZE_POST;
+            for (size_t k = 0; k < AXIS_SIZE; ++k) {
+                const size_t idx = baseIdx + k * AXIS_SIZE_POST;
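+                // Subtracting maxVal keeps std::exp in a safe range without
+                // changing the softmax result (numerical stability).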
                 outputs[idx] = std::exp(inputs[idx] - maxVal);
                 sumExp += outputs[idx];
             }
 
             // Write the softmax values to the output.
-            for (int k = 0; k < AXIS_SIZE; ++k) {
-                const int idx = baseIdx + k * AXIS_SIZE_POST;
+            for (size_t k = 0; k < AXIS_SIZE; ++k) {
+                const size_t idx = baseIdx + k * AXIS_SIZE_POST;
                 outputs[idx] /= sumExp;
             }
         }
diff --git a/aidge_export_cpp/kernels/transpose.hpp b/aidge_export_cpp/kernels/transpose.hpp
index 31c9e27869c5e2fde701f6700fd4964ea4cefd29..57d48c283644df26a52ff2c561206e09af0602a1 100644
--- a/aidge_export_cpp/kernels/transpose.hpp
+++ b/aidge_export_cpp/kernels/transpose.hpp
@@ -12,6 +12,8 @@
 #ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
 #define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
 
+#include <sys/types.h>
+
 /**
  * @brief Transposes an N-dimensional tensor based on the specified permutation.
  *
@@ -25,62 +27,36 @@
  * @tparam T        Data type of the tensor elements.
  * @tparam NB_DIMS  Number of dimensions of the input tensor.
  * @param[in]  inputs      Pointer to the input tensor data stored in contiguous memory.
- * @param[in]  in_dims     Array containing the size of each dimension of the input tensor.
- * @param[in]  permute     Array of unsigned integers specifying the desired permutation
- *                         of dimensions. Each value should be in the range [0, NB_DIMS-1],
- *                         defining the new order of dimensions for the output tensor.
- * @param[in]  total_size  Total number of elements in the input/output tensor.
  * @param[out] outputs     Pointer to the pre-allocated memory for the transposed tensor.
  *                         Ensure this memory is appropriately sized to hold the transposed data.
  */
-template <typename T,unsigned int NB_DIMS>
+template <typename T, size_t NB_DIMS, size_t NB_ELTS,
+          const size_t PERMUTE[], const size_t IN_DIMS[], const size_t OUT_STRIDE[]>
+__attribute__((always_inline)) inline
 void transpose_ND_forward(const T *__restrict inputs,
-                          const unsigned int *in_dims,
-                          const unsigned int *permute,
-                          const unsigned int total_size,
                           T *__restrict outputs)
 {
-    // Compute strides for input tensor
-    unsigned int in_strides[NB_DIMS];
-    in_strides[NB_DIMS - 1] = 1;
-    for (int i = NB_DIMS - 2; i >= 0; --i)
-    {
-        in_strides[i] = in_strides[i + 1] * in_dims[i + 1];
-    }
-
-    // Compute dimensions and strides for output tensor
-    unsigned int out_dims[NB_DIMS];
-    unsigned int out_strides[NB_DIMS];
-    out_strides[NB_DIMS - 1] = 1;
-    for (unsigned int i = 0; i < NB_DIMS; ++i)
-    {
-        out_dims[i] = in_dims[permute[i]];
-    }
-    for (int i = NB_DIMS - 2; i >= 0; --i)
-    {
-        out_strides[i] = out_strides[i + 1] * out_dims[i + 1];
-    }
-
-    unsigned int current_idx[NB_DIMS];
+    size_t current_idx[NB_DIMS] = {0};
 
     // Iterate over all elements in the input tensor
-    for (unsigned int idx = 0; idx < total_size; ++idx)
-    {
-
-        unsigned int remaining = idx;
-        for (unsigned int i = 0; i < NB_DIMS; ++i)
-        {
-            current_idx[i] = remaining / in_strides[i];
-            remaining = remaining % in_strides[i];
-        }
-
-        unsigned int output_index = 0;
-        for (unsigned int i = 0; i < NB_DIMS; ++i)
-        {
-            output_index += current_idx[permute[i]] * out_strides[i];
+    for (size_t idx = 0; idx < NB_ELTS; ++idx) {
+        // Compute output index using current_idx
+        size_t output_index = 0;
+        for (size_t i = 0; i < NB_DIMS; ++i) {
+            output_index += current_idx[PERMUTE[i]] * OUT_STRIDE[i];
         }
 
         outputs[output_index] = inputs[idx];
+
+        // Increment current_idx as a multidimensional counter
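+        // e.g. with IN_DIMS = {2, 3}: (0,0) -> (0,1) -> (0,2) -> (1,0) -> (1,1) -> (1,2)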
+        for (int i = NB_DIMS - 1; i >= 0; --i) {
+            if (++current_idx[i] < IN_DIMS[i]) {
+                break;
+            }
+            else {
+                current_idx[i] = 0;
+            }
+        }
     }
 }
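+
+// Usage sketch (illustrative, assuming C++17 constexpr arrays with linkage):
+//   constexpr size_t PERM[2]    = {1, 0};
+//   constexpr size_t DIMS[2]    = {2, 3};  // input is 2x3
+//   constexpr size_t OSTRIDE[2] = {2, 1};  // strides of the 3x2 output
+//   transpose_ND_forward<float, 2, 6, PERM, DIMS, OSTRIDE>(in, out);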
 
diff --git a/aidge_export_cpp/operators/BatchNorm.py b/aidge_export_cpp/operators/BatchNorm.py
index b0f5a16f195fb27846db9a8727b3804d84520d12..4552a009a2bc46726e5b8acc6473dfde467c2cf2 100644
--- a/aidge_export_cpp/operators/BatchNorm.py
+++ b/aidge_export_cpp/operators/BatchNorm.py
@@ -3,7 +3,9 @@ from aidge_core.export_utils import ExportNodeCpp
 from aidge_export_cpp import ROOT
 from aidge_export_cpp import ExportLibCpp
 
-@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+@ExportLibCpp.register("BatchNorm2D",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32, aidge_core.dformat.nchw)),
+    aidge_core.ProdConso.in_place_model)
 class BatchNorm(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
diff --git a/aidge_export_cpp/operators/Conv.py b/aidge_export_cpp/operators/Conv.py
index c8137c51377c103855b9c2d133707124ebef64c5..44b50b49a2ecf4c5957fc842aee01a19c3e0ecb1 100644
--- a/aidge_export_cpp/operators/Conv.py
+++ b/aidge_export_cpp/operators/Conv.py
@@ -2,8 +2,116 @@ import aidge_core
 from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
 from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
 
-@ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
-class Conv(ExportNodeCpp):
+# Consumer-Producer model allowing memory wrapping for Conv/PaddedConv
+# (and Pool/PaddedPool), keeping a margin of one input line in NHWC data
+# format (one input line = W*C elements)
+class PaddedInPlace_CP(aidge_core.ProdConso):
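+    """Worked example (illustrative): for a PaddedConv2D with padding_y = 1,
+    stride_y = 1 and an NHWC input of dims [1, 32, 32, 8] (one line = 32*8),
+    get_nb_required_protected(0) returns (1 + 1 // 1) * 32 * 8 = 512 elements.
+    """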
+    def __init__(self, op: aidge_core.Operator):
+        aidge_core.ProdConso.__init__(self, op, False)
+
+    @staticmethod
+    def default_model(op: aidge_core.Operator):
+        return PaddedInPlace_CP(op)
+
+    def get_nb_required_protected(self, input_idx):
+        if input_idx != 0:
+            return super().get_nb_required_protected(input_idx)
+
+        input = self.get_operator().get_input(0)
+        if not input:
+            return aidge_core.Elts_t.none_elts()
+
+        # Non-Padded case: margin = one input line
+        margin = 1
+        if not self.get_operator().is_atomic():
+            # Padded case: margin = (padding_y / stride_y) input lines
+            sub_graph = self.get_operator().get_micro_graph().clone()
+            aidge_core.expand_metaops(sub_graph, True)
+
+            padding_y = 0
+            stride_y = 1
+            for node in sub_graph.get_nodes():
+                if hasattr(node.get_operator().attr, 'stride_dims'):
+                    if len(node.get_operator().attr.stride_dims) > 1:
+                        stride_y = node.get_operator().attr.stride_dims[0]
+                elif hasattr(node.get_operator().attr, 'begin_end_borders'):
+                    if len(node.get_operator().attr.begin_end_borders) > 2:
+                        padding_y = node.get_operator().attr.begin_end_borders[0]
+
+            margin += padding_y // stride_y
+
+        if len(input.dims()) == 4:
+            # 2D: one input line = W*C
+            margin *= input.dims()[2] * input.dims()[3]
+        else:
+            # 1D: one input line = C
+            margin *= input.dims()[2]
+
+        return aidge_core.Elts_t.data_elts(margin)
+
+
+@ExportLibCpp.register("Conv1D",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
+class Conv1D(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Initialize kernel attributes
+        self.attributes["padding"] = [0, 0, 0, 0]
+        self.attributes["activation"] = "Linear"
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        ## Scaling
+        self.attributes["rescaling"] = "NoScaling"
+        self.attributes["shift_value"] = 0
+
+        # Browse the metaop to update kernel attributes
+        ConvNode = get_node_from_metaop(node, "Conv1D") 
+        self.attributes["kernel_dims"] = ConvNode[0].get_operator().attr.kernel_dims
+        self.attributes["stride_dims"] = ConvNode[0].get_operator().attr.stride_dims
+        self.attributes["dilation_dims"] = ConvNode[0].get_operator().attr.dilation_dims
+
+        # Template for layer configuration file generation
+        self.config_template = str(ROOT / "templates" / "configuration" / "convolution_config.jinja")
+        
+        # Template layer call function generation within the forward file
+        self.forward_template = str(ROOT / "templates" / "kernel_forward" / "convolution_forward.jinja")
+        
+        # Files to include within the generated forward.cpp file
+        self.include_list = []
+        
+        # Path to the kernel(s) files to copy
+        self.add_kernel_to_copy(ROOT / "kernels" / "convolution.hpp")
+        self.add_kernel_to_copy(ROOT / "static" / "macs.hpp", "include/network", fwd_include=False)
+        
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
+
+
+@ExportLibCpp.register("Conv2D",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
+class Conv2D(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
@@ -41,8 +149,19 @@ class Conv(ExportNodeCpp):
             self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
 
 
-@ExportLibCpp.register_metaop("QConv", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
-class QConv(Conv):
+@ExportLibCpp.register_metaop("QConv",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
+class QConv(Conv2D):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
@@ -54,7 +173,18 @@ class QConv(Conv):
             self.attributes["rescaling"] = "SingleShiftScaling"
 
 
-@ExportLibCpp.register_metaop("PadConv", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop(["PaddedConv2D", "PadConv"],
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class PadConv(QConv):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -64,7 +194,18 @@ class PadConv(QConv):
         self.attributes["padding"] = PadNode[0].get_operator().attr.begin_end_borders
 
 
-@ExportLibCpp.register_metaop("ConvAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("ConvAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class ConvAct(QConv):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -75,7 +216,18 @@ class ConvAct(QConv):
         else:
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.")
 
-@ExportLibCpp.register_metaop("PadConvAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadConvAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class PadConvAct(PadConv, ConvAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
diff --git a/aidge_export_cpp/operators/ConvDw.py b/aidge_export_cpp/operators/ConvDw.py
index 936c3b6a9a6b96012d3c01a1ab6961e1adcc0c1e..136666719552aa58ac8e7a08c363c4cfd128c5e2 100644
--- a/aidge_export_cpp/operators/ConvDw.py
+++ b/aidge_export_cpp/operators/ConvDw.py
@@ -1,8 +1,20 @@
 import aidge_core
 from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
 from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
-
-@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+from .Conv import PaddedInPlace_CP
+
+@ExportLibCpp.register("ConvDepthWise2D",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class ConvDw(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -42,7 +54,18 @@ class ConvDw(ExportNodeCpp):
             self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
 
 
-@ExportLibCpp.register_metaop("QConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("QConvDw",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class QConvDw(ConvDw):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -55,7 +78,18 @@ class QConvDw(ConvDw):
             self.attributes["rescaling"] = "SingleShiftScaling"
 
 
-@ExportLibCpp.register_metaop("PadConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop(["PaddedConvDepthWise2D", "PadConvDw"],
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class PadConvDw(QConvDw):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -65,7 +99,18 @@ class PadConvDw(QConvDw):
         self.attributes["padding"] = PadNode[0].get_operator().attr.begin_end_borders
 
 
-@ExportLibCpp.register_metaop("ConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("ConvDwAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class ConvDwAct(QConvDw):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -76,7 +121,18 @@ class ConvDwAct(QConvDw):
         else:
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.")
 
-@ExportLibCpp.register_metaop("PadConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadConvDwAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)
+        ],
+    ),
+    PaddedInPlace_CP.default_model)
 class PadConvDwAct(PadConvDw, ConvDwAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
diff --git a/aidge_export_cpp/operators/ElemWise.py b/aidge_export_cpp/operators/ElemWise.py
index 7d073ca9549aa0fb67a2c63562536a7c6808e6cd..751fb3a7ec89d1f6e27daa3a1aae478709d715ac 100644
--- a/aidge_export_cpp/operators/ElemWise.py
+++ b/aidge_export_cpp/operators/ElemWise.py
@@ -15,6 +15,80 @@ class ElemWise(ExportNodeCpp):
         self.attributes["shift_value"] = 0
         self.attributes["coef_value"] = 1
 
+        nbdims_out = len(self.attributes["out_dims"][0])
+        dims_a = self.attributes["in_dims"][0]
+        dims_b = self.attributes["in_dims"][1]
+        ndim_a = [0] * nbdims_out
+        ndim_b = [0] * nbdims_out
+
+        idx_a = nbdims_out - len(dims_a)
+        for i in range(nbdims_out):
+            ndim_a[i] = 1 if i < idx_a else dims_a[i - idx_a]
+
+        idx_b = nbdims_out - len(dims_b)
+        for i in range(nbdims_out):
+            ndim_b[i] = 1 if i < idx_b else dims_b[i - idx_b]
+
+        # Find, from the last axis backwards, how far the two inputs share the same dims
+        contiguousIdx = nbdims_out - 1
+        for i in range(nbdims_out - 1, -1, -1):
+            if ndim_a[i] != ndim_b[i]:
+                break
+            contiguousIdx = i
+
+        # Compute the highest number of contiguous data
+        input0_contiguous_size = 1
+        input1_contiguous_size = 1
+        output_contiguous_size = 1
+        for i in range(contiguousIdx, nbdims_out):
+            input0_contiguous_size *= ndim_a[i]
+            input1_contiguous_size *= ndim_b[i]
+            output_contiguous_size *= self.attributes["out_dims"][0][i]
+
+        self.attributes["input1_cont_size"] = input0_contiguous_size
+        self.attributes["input2_cont_size"] = input1_contiguous_size
+        self.attributes["output_cont_size"] = output_contiguous_size
+
+        # Initialize strides for broadcasting
+        stride_post0 = [0] * contiguousIdx
+        stride_post1 = [0] * contiguousIdx
+        stride_step0 = [0] * contiguousIdx
+        stride_step1 = [0] * contiguousIdx
+
+        if contiguousIdx > 0:
+            stride_post0[contiguousIdx - 1] = 1
+            stride_post1[contiguousIdx - 1] = 1
+            for i in range(contiguousIdx - 2, -1, -1):
+                stride_post0[i] = stride_post0[i + 1] * ndim_a[i + 1]
+                stride_post1[i] = stride_post1[i + 1] * ndim_b[i + 1]
+
+            for i in range(contiguousIdx):
+                stride_step0[i] = 1 - stride_post0[i] if ndim_a[i] == 1 else 1
+                stride_step1[i] = 1 - stride_post1[i] if ndim_b[i] == 1 else 1
+
+        # Offset and matrix count
+        offsetIn0 = 0
+        offsetIn1 = 0
+        nbMatrices = 1
+        for i in range(contiguousIdx):
+            nbMatrices *= self.attributes["out_dims"][0][i]
+
+        self.attributes["offset_in1"] = [0]
+        self.attributes["offset_in2"] = [0]
+
+        for stack in range(1, nbMatrices):
+            dim = contiguousIdx - 1
+            tmp_stack = stack
+            while tmp_stack % self.attributes["out_dims"][0][dim] == 0:
+                tmp_stack //= self.attributes["out_dims"][0][dim]
+                dim -= 1
+            offsetIn0 += stride_step0[dim]
+            offsetIn1 += stride_step1[dim]
+
+            self.attributes["offset_in1"].append(offsetIn0)
+            self.attributes["offset_in2"].append(offsetIn1)
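+
+        # Worked example (illustrative): broadcasting in_dims [2, 1, 4] with
+        # [1, 3, 4] gives out_dims [2, 3, 4], contiguousIdx = 2 and contiguous
+        # blocks of 4 elements. The loop above then yields
+        # offset_in1 = [0, 0, 0, 1, 1, 1] and offset_in2 = [0, 1, 2, 0, 1, 2]:
+        # input 1 repeats each of its 2 blocks three times while input 2
+        # cycles through its 3 blocks twice.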
+
         # Template for layer configutation file generation
         self.config_template = str(ROOT / "templates" / "configuration" / "elemwise_config.jinja")
 
@@ -47,20 +121,26 @@ class QElemWise(ElemWise):
             self.attributes["rescaling"] = "SingleShiftScaling"
 
 
-@ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("Add",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class Add(ElemWise):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["elemwise_op"] = "Add"
 
 
-@ExportLibCpp.register_metaop("QAdd", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("QAdd",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class QAdd(QElemWise, Add):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("AddAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("AddAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class AddAct(QAdd):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -72,20 +152,26 @@ class AddAct(QAdd):
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.") 
 
 
-@ExportLibCpp.register("Sub", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("Sub",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class Sub(ElemWise):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["elemwise_op"] = "Sub"
 
 
-@ExportLibCpp.register_metaop("QSub", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("QSub",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class QSub(QElemWise, Sub):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("SubAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("SubAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class SubAct(QSub):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -97,15 +183,41 @@ class SubAct(QSub):
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.") 
 
 
-@ExportLibCpp.register("Mul", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("Mul",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class Mul(QElemWise):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["elemwise_op"] = "Mul"
 
 
-@ExportLibCpp.register_metaop("MulAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("MulAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class MulAct(Mul):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Browse the metaop to update kernel attributes
+        if get_node_from_metaop(node, "ReLU"):
+            self.attributes["activation"] = "Rectifier"
+        else:
+            aidge_core.Log.error(f"{node.type()} activation is not yet supported.")
+
+@ExportLibCpp.register("Div",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
+class Div(QElemWise):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["elemwise_op"] = "Div"
+
+
+@ExportLibCpp.register_metaop("DivAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
+class DivAct(Div):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
diff --git a/aidge_export_cpp/operators/Erf.py b/aidge_export_cpp/operators/Erf.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd0f36a2e09ea359681d88b77073905ce6bb2fff
--- /dev/null
+++ b/aidge_export_cpp/operators/Erf.py
@@ -0,0 +1,21 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
+from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
+
+@ExportLibCpp.register("Erf",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)),
+    aidge_core.ProdConso.in_place_model)
+class ErfCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "erf_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "erf_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "erf.hpp"),
+            str(ROOT / "kernels" / "activation.hpp")
+        ]
\ No newline at end of file
diff --git a/aidge_export_cpp/operators/Fc.py b/aidge_export_cpp/operators/Fc.py
index d32d20e2fad90f8418ee58067f1cd6e6c7e72065..184baabd7c02cae7ca6b1942a6cb37136ceb1a7b 100644
--- a/aidge_export_cpp/operators/Fc.py
+++ b/aidge_export_cpp/operators/Fc.py
@@ -2,12 +2,22 @@ import aidge_core
 from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
 from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
 
-@ExportLibCpp.register("FC", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("FC",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
 class FC(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
         # Initialize kernel attributes
+        self.attributes["kernel"] = "default"
         self.attributes["activation"] = "Linear"
         self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
 
@@ -33,8 +43,34 @@ class FC(ExportNodeCpp):
             self.include_list.append("network/utils.hpp")   # aidge_cmp function
             self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp")
 
+@ExportLibCpp.register("FC",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
+class FC_NHWC(FC):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
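+        # Kernel variant selection (assumption: the "transpose" kernel variant
+        # reorders the NHWC input on the fly to match the FC weight layout,
+        # while "ignore_input_format" marks the input as layout-agnostic).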
+        if node.attributes().has_attr("ignore_input_format"):
+            self.attributes["kernel"] = ""
+        else:
+            self.attributes["kernel"] = "transpose"
 
-@ExportLibCpp.register_metaop("QFC", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("QFC",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
 class QFC(FC):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -46,8 +82,34 @@ class QFC(FC):
         if self.attributes["shift_value"] != 0:
             self.attributes["rescaling"] = "SingleShiftScaling"
 
+@ExportLibCpp.register_metaop("QFC",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
+class QFC_NHWC(QFC):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        if node.attributes().has_attr("ignore_input_format"):
+            self.attributes["kernel"] = ""
+        else:
+            self.attributes["kernel"] = "transpose"
 
-@ExportLibCpp.register_metaop("FCAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("FCAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
 class FCAct(QFC):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -57,3 +119,21 @@ class FCAct(QFC):
             self.attributes["activation"] = "Rectifier"
         else:
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.")
+
+@ExportLibCpp.register_metaop("FCAct",
+    aidge_core.ImplSpec(
+        [ # Input specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc),
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+        [ # Output specifications
+            aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any)
+        ],
+    ))
+class FCAct_NHWC(FCAct):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        if node.attributes().has_attr("ignore_input_format"):
+            self.attributes["kernel"] = ""
+        else:
+            self.attributes["kernel"] = "transpose"
diff --git a/aidge_export_cpp/operators/Identity.py b/aidge_export_cpp/operators/Identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..69d36c59a9d41259ff5f40320aaf35070606169b
--- /dev/null
+++ b/aidge_export_cpp/operators/Identity.py
@@ -0,0 +1,19 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp
+from aidge_export_cpp import ROOT
+from aidge_export_cpp import ExportLibCpp
+
+@ExportLibCpp.register("Identity",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
+class IdentityCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "identity_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "identity_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "identity.hpp"),
+        ]
diff --git a/aidge_export_cpp/operators/MatMul.py b/aidge_export_cpp/operators/MatMul.py
index ac8802cd876c257f70f82ac5464df775299f52ce..7baf4a7886e0c286d8263d924fa75b07865c3790 100644
--- a/aidge_export_cpp/operators/MatMul.py
+++ b/aidge_export_cpp/operators/MatMul.py
@@ -9,6 +9,75 @@ class MatMulCPP(ExportNodeCpp):
         super().__init__(node, mem_info)
         self.attributes["activation"] = "Linear"
         self.attributes["rescaling"] = "NoScaling"
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        # Initialize arrays storing the (possibly broadcast) input dims
+        nbdims_out = len(self.attributes["out_dims"][0])
+        dims_a = self.attributes["in_dims"][0]
+        dims_b = self.attributes["in_dims"][1]
+        ndim_a = [0] * nbdims_out
+        ndim_b = [0] * nbdims_out
+
+        # 1D inputs are padded with leading 1s by the broadcast loops below,
+        # so no dedicated 1D special case is needed.
+
+        idx_a = nbdims_out - len(dims_a)
+        for i in range(nbdims_out):
+            ndim_a[i] = 1 if i < idx_a else dims_a[i - idx_a]
+
+        idx_b = nbdims_out - len(dims_b)
+        for i in range(nbdims_out):
+            ndim_b[i] = 1 if i < idx_b else dims_b[i - idx_b]
+
+        # Initialize strides for broadcasting
+        stride_post0 = [0] * (nbdims_out - 2)
+        stride_post1 = [0] * (nbdims_out - 2)
+        stride_step0 = [0] * (nbdims_out - 2)
+        stride_step1 = [0] * (nbdims_out - 2)
+
+        if nbdims_out > 2:
+            stride_post0[nbdims_out - 3] = 1
+            stride_post1[nbdims_out - 3] = 1
+            for i in range(nbdims_out - 4, -1, -1):
+                stride_post0[i] = stride_post0[i + 1] * ndim_a[i + 1]
+                stride_post1[i] = stride_post1[i + 1] * ndim_b[i + 1]
+
+            for i in range(nbdims_out - 2):
+                stride_step0[i] = 1 - stride_post0[i] if ndim_a[i] == 1 else 1
+                stride_step1[i] = 1 - stride_post1[i] if ndim_b[i] == 1 else 1
+
+        # When len(dims_a) == len(dims_b), both already have nbdims_out dims;
+        # otherwise the shorter input has been broadcast to nbdims_out dims above.
+        nbMatrices = 1
+        for i in range(nbdims_out - 3, -1, -1):
+            nbMatrices *= self.attributes["out_dims"][0][i]
+
+        offsetIn0 = 0
+        offsetIn1 = 0
+        self.attributes["offset_in1"] = [0]
+        self.attributes["offset_in2"] = [0]
+
+        for stack in range(1, nbMatrices):
+            dim = nbdims_out - 3
+            tmp_stack = stack
+            while tmp_stack % self.attributes["out_dims"][0][dim] == 0:
+                tmp_stack //= self.attributes["out_dims"][0][dim]
+                dim -= 1
+            offsetIn0 += stride_step0[dim]
+            offsetIn1 += stride_step1[dim]
+
+            self.attributes["offset_in1"].append(offsetIn0)
+            self.attributes["offset_in2"].append(offsetIn1)
+
+        self.attributes["n"] = ndim_a[nbdims_out - 2]
+        self.attributes["m"] = ndim_b[nbdims_out - 1]
+        self.attributes["k"] = ndim_a[nbdims_out - 1]
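+
+        # Worked example (illustrative): A of dims [2, 1, 3, 4] matmul-ed with
+        # B of dims [5, 4, 6] broadcasts to out_dims [2, 5, 3, 6], giving
+        # n = 3, m = 6, k = 4 and nbMatrices = 2 * 5 = 10 stacked products.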
+
         self.config_template = str(
             ROOT / "templates" / "configuration" / "matmul_config.jinja")
         self.forward_template = str(
@@ -17,3 +86,8 @@ class MatMulCPP(ExportNodeCpp):
         self.kernels_to_copy = [
             str(ROOT / "kernels" / "matmul.hpp"),
         ]
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
diff --git a/aidge_export_cpp/operators/Pad.py b/aidge_export_cpp/operators/Pad.py
index f84f2cff0f1df283327c1122ddb7b77049bddd3f..bc73ef43ab862bf11a2a41477006827ffcb19afa 100644
--- a/aidge_export_cpp/operators/Pad.py
+++ b/aidge_export_cpp/operators/Pad.py
@@ -3,7 +3,30 @@ from aidge_core.export_utils import ExportNodeCpp
 from aidge_export_cpp import ROOT
 from aidge_export_cpp import ExportLibCpp
 
-@ExportLibCpp.register("Pad2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+# Consumer-Producer model to allow memory wrapping for Pad in-place operator
+class PadInPlace_CP(aidge_core.ProdConso):
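+    """Worked example (illustrative): a Pad2D adding a 1-pixel border to a
+    3x3 input with C channels produces a 5x5 output, so the protected margin
+    is (5*5 - 3*3) * C = 16 * C elements.
+    """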
+    def __init__(self, op: aidge_core.Operator):
+        aidge_core.ProdConso.__init__(self, op, False)
+
+    @staticmethod
+    def default_model(op: aidge_core.Operator):
+        return PadInPlace_CP(op)
+
+    def get_nb_required_protected(self, input_idx):
+        if input_idx != 0:
+            return super().get_nb_required_protected(input_idx)
+
+        pad_op = self.get_operator()
+        input = pad_op.get_input(0)
+        if input:
+            output = pad_op.get_output(0)
+            return aidge_core.Elts_t.data_elts(max(0, output.size() - input.size()))
+        else:
+            return aidge_core.Elts_t.none_elts()
+
+
+@ExportLibCpp.register("Pad2D",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    PadInPlace_CP.default_model)
 class CppPad(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
diff --git a/aidge_export_cpp/operators/Pool.py b/aidge_export_cpp/operators/Pool.py
index 10d595e5ed4a76c22bcc15f90d8c693b8dbf2144..1d6cd53d6d0f2d7c000b8f3acc81ee5771295080 100644
--- a/aidge_export_cpp/operators/Pool.py
+++ b/aidge_export_cpp/operators/Pool.py
@@ -2,6 +2,7 @@ import aidge_core
 from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
 from aidge_export_cpp import ROOT
 from aidge_export_cpp import ExportLibCpp
+from .Conv import PaddedInPlace_CP
 
 class Pool(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -52,7 +53,9 @@ class PoolAct(Pool):
             aidge_core.Log.error(f"{node.type()} activation is not yet supported.")  
 
 
-@ExportLibCpp.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("MaxPooling2D",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class MaxPool(Pool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -64,25 +67,33 @@ class MaxPool(Pool):
         self.attributes["stride_dims"] = PoolNode[0].get_operator().attr.stride_dims
 
 
-@ExportLibCpp.register_metaop("PadMaxPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop(["PaddedMaxPooling2D", "PadMaxPool"],
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadMaxPool(MaxPool, PadPool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("MaxPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("MaxPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class MaxPoolAct(MaxPool, PoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("PadMaxPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadMaxPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadMaxPoolAct(PadMaxPool, MaxPoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register("AvgPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("AvgPooling2D",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class AvgPool(Pool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -94,25 +105,33 @@ class AvgPool(Pool):
         self.attributes["stride_dims"] = PoolNode[0].get_operator().attr.stride_dims
 
 
-@ExportLibCpp.register_metaop("PadAvgPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop(["PaddedAvgPooling2D", "PadAvgPool"],
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadAvgPool(AvgPool, PadPool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("AvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("AvgPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class AvgPoolAct(AvgPool, PoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("PadAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadAvgPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadAvgPoolAct(PadAvgPool, AvgPoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register("GlobalAveragePooling", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("GlobalAveragePooling",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    aidge_core.ProdConso.in_place_model)
 class GlobalAvgPool(Pool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -121,19 +140,25 @@ class GlobalAvgPool(Pool):
         self.attributes["kernel_dims"] = [self.attributes["in_width"][0], self.attributes["in_height"][0]]
 
 
-@ExportLibCpp.register_metaop("PadGlobalAvgPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadGlobalAvgPool",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadGlobalAvgPool(GlobalAvgPool, PadPool):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("GlobalAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("GlobalAvgPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    aidge_core.ProdConso.in_place_model)
 class GlobalAvgPoolAct(GlobalAvgPool, PoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
 
-@ExportLibCpp.register_metaop("PadGlobalAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register_metaop("PadGlobalAvgPoolAct",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)),
+    PaddedInPlace_CP.default_model)
 class PadGlobalAvgPoolAct(PadGlobalAvgPool, GlobalAvgPoolAct):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
\ No newline at end of file
diff --git a/aidge_export_cpp/operators/Producer.py b/aidge_export_cpp/operators/Producer.py
index 627dcb29ed21e2779a30978b5b22768c0c2cc9c4..5bd63323155b3342dcefc6b92e3c9861c8333127 100644
--- a/aidge_export_cpp/operators/Producer.py
+++ b/aidge_export_cpp/operators/Producer.py
@@ -2,29 +2,12 @@ import os
 from pathlib import Path
 import numpy as np
 import aidge_core
-from aidge_core.export_utils import ExportNode, generate_file
+from aidge_core.export_utils import ExportNodeCpp, generate_file, aidge2c
 from aidge_export_cpp import ROOT
 from aidge_export_cpp import ExportLibCpp
 
-def numpy_dtype2ctype(dtype):
-    if dtype == np.int8:
-        return "int8_t"
-    elif dtype == np.int16:
-        return "int16_t"
-    elif dtype == np.int32:
-        return "int32_t"
-    elif dtype == np.int64:
-        return "int64_t"
-    elif dtype == np.float32:
-        return "float"
-    elif dtype == np.float64:
-        return "double"
-    # Add more dtype mappings as needed
-    else:
-        raise ValueError(f"Unsupported {dtype} dtype")
-
 def export_params(name: str,
-                  array: np.ndarray,
+                  output: aidge_core.Tensor,
                   filepath: str):
 
     # Get directory name of the file
@@ -38,30 +21,40 @@ def export_params(name: str,
         filepath,
         str(ROOT / "templates" / "data" / "parameters.jinja"),
         name=name,
-        data_t=numpy_dtype2ctype(array.dtype),
-        values=array.tolist()
+        dims=output.dims(),
+        dtype=aidge2c(output.dtype()),
+        values=np.array(output).tolist()
     )
 
 @ExportLibCpp.register("Producer", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
-class ProducerCPP(ExportNode):
+class ProducerCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
-        self.values = np.array(self.operator.get_output(0))
+        self.output = self.operator.get_output(0)
         self.ignore = node.attributes().has_attr("ignore")
 
-        if len(self.values.shape) == 4:  # Note: export in HWC
-            self.values = np.transpose(self.values, (0, 2, 3, 1))
-
     def export(self, export_folder: Path):
-        if not self.ignore :
-            header_path = f"include/parameters/{self.attributes['name']}.h"
-            export_params(
-                self.attributes['out_name'][0],
-                self.values.reshape(-1),
-                str(export_folder / header_path))
-            return [header_path]
-        return []
+        if self.ignore:
+            return []
+
+        path_to_definition = f"{self.config_path}/{self.attributes['name']}.{self.config_extension}"
+
+        try:
+            aidge_core.export_utils.code_generation.generate_file(
+                str(export_folder / path_to_definition),
+                str(ROOT / "templates" / "configuration" / "producer_config.jinja"),
+                **self.attributes
+            )
+        except Exception as e:
+            raise RuntimeError(f"Error when creating config file for {self.node.name()}[{self.node.type()}].") from e
+
+        header_path = f"include/parameters/{self.attributes['name']}.h"
+        export_params(
+            self.attributes['out_name'][0],
+            self.output,
+            str(export_folder / header_path))
+        return [path_to_definition, header_path]
 
     def forward(self):
         # A Producer does nothing during forward
-        return []
\ No newline at end of file
+        return []
diff --git a/aidge_export_cpp/operators/Quantizer.py b/aidge_export_cpp/operators/Quantizer.py
index 51f5c23da24e7c6a47c162314f54a15c8845fc00..ae2860018406ee93871755243c38414e0e719568 100644
--- a/aidge_export_cpp/operators/Quantizer.py
+++ b/aidge_export_cpp/operators/Quantizer.py
@@ -28,16 +28,16 @@ class Quantizer(ExportNodeCpp):
             self.attributes["rescaling"] = "SingleShiftScaling"
 
         # Template for layer configutation file generation
-        self.config_template = str(ROOT / "templates" / "configuration" / "rescaling_config.jinja")
+        self.config_template = str(ROOT / "templates" / "configuration" / "activation_config.jinja")
 
         # Template layer call function generation within the forward file
-        self.forward_template = str(ROOT / "templates" / "kernel_forward" / "rescaling_forward.jinja")
+        self.forward_template = str(ROOT / "templates" / "kernel_forward" / "activation_forward.jinja")
 
         # Files to include within the generated forward.cpp file
         self.include_list = []
 
         # Path to the kernel(s) files to copy
-        self.add_kernel_to_copy(ROOT / "kernels" / "rescaling.hpp")
+        self.add_kernel_to_copy(ROOT / "kernels" / "activation.hpp")
         
         # Include aidge outputs within the fwd file
         if self.attributes["aidge_cmp"]:
diff --git a/aidge_export_cpp/operators/ReLU.py b/aidge_export_cpp/operators/ReLU.py
index 55e7e19425e0a5b61790b58a2d36a8f233f75228..10390ea730fb2750fbc5a3b43d9c95372d6b211a 100644
--- a/aidge_export_cpp/operators/ReLU.py
+++ b/aidge_export_cpp/operators/ReLU.py
@@ -2,7 +2,9 @@ import aidge_core
 from aidge_core.export_utils import ExportNodeCpp
 from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
 
-@ExportLibCpp.register("ReLU", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+@ExportLibCpp.register("ReLU",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
 class ReLU(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
@@ -32,7 +34,34 @@ class ReLU(ExportNodeCpp):
         if self.attributes["aidge_cmp"]:
             self.include_list.append("network/utils.hpp")   # aidge_cmp function
             self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp")
-    
+
+@ExportLibCpp.register("LeakyReLU",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
+class LeakyReLU(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Initialize kernel attributes
+        self.attributes["alpha"] = node.get_operator().attr.negative_slope
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        # Template for layer configuration file generation
+        self.config_template = str(ROOT / "templates" / "configuration" / "leakyrelu_config.jinja")
+        
+        # Template layer call function generation within the forward file
+        self.forward_template = str(ROOT / "templates" / "kernel_forward" / "leakyrelu_forward.jinja")
+        
+        # Files to include within the generated forward.cpp file
+        self.include_list = []
+        
+        # Path to the kernel(s) files to copy
+        self.add_kernel_to_copy(ROOT / "kernels" / "leakyrelu.hpp")
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp")
         
 @ExportLibCpp.register_metaop("QReLU", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
 class QReLU(ReLU):
diff --git a/aidge_export_cpp/operators/ReduceMean.py b/aidge_export_cpp/operators/ReduceMean.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd0d6316ae6143f9aad062f3feec14b4a6d803bd
--- /dev/null
+++ b/aidge_export_cpp/operators/ReduceMean.py
@@ -0,0 +1,56 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp
+from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
+
+@ExportLibCpp.register("ReduceMean", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class ReducemeanCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        if self.operator.get_input(0) is None:
+            raise AttributeError("Input 0 not found for operator ReduceMean")
+
+        input_T = self.operator.get_input(0)
+
+        # COMPUTING PRE/POST AXES STRIDES:
+        # Example :
+        # input dims {3, 3, 2}
+        # stride_pre = {1,3,9}
+        # stride_post = {6, 2, 1}
+        post_axis_strides = input_T.strides()
+
+        pre_axis_strides = [1]
+        for i in range(1, len(post_axis_strides)):
+            pre_axis_strides.append(pre_axis_strides[i - 1] * input_T.dims()[i - 1])
+
+        in_nb_elts = input_T.dims()[0] * input_T.strides()[0]
+        out_nb_elts = in_nb_elts
+        axes_to_reduce = self.node.get_operator().attr.axes
+        for i in axes_to_reduce: 
+            out_nb_elts = out_nb_elts // input_T.dims()[i]
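+        # e.g. for input dims {3, 3, 2} and axes_to_reduce = [1]:
+        # in_nb_elts = 18 and out_nb_elts = 18 // 3 = 6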
+
+        self.attributes["in_dims"] = input_T.dims()
+        self.attributes["in_nb_dims"] = len(input_T.dims())
+        self.attributes["in_nb_elts"] = in_nb_elts
+        self.attributes["out_nb_elts"] = out_nb_elts
+        self.attributes["nb_axes_to_reduce"] = len(self.operator.attr.axes)
+        self.attributes["axes_to_reduce"] = self.node.get_operator().attr.axes
+        self.attributes["pre_axis_strides"] = pre_axis_strides
+        self.attributes["post_axis_strides"] = post_axis_strides
+
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "reducemean_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "reducemean_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "reducemean.hpp"),
+        ]
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
diff --git a/aidge_export_cpp/operators/Reshape.py b/aidge_export_cpp/operators/Reshape.py
index ed094c94bbffcf84a2370bb24cb9110f9e2e6c68..b7166853a5790d89b40ffc1bd15400614361cfb3 100644
--- a/aidge_export_cpp/operators/Reshape.py
+++ b/aidge_export_cpp/operators/Reshape.py
@@ -3,15 +3,28 @@ from aidge_core.export_utils import ExportNodeCpp
 from aidge_export_cpp import ROOT
 from aidge_export_cpp import ExportLibCpp
 
-@ExportLibCpp.register("Reshape", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+@ExportLibCpp.register("Reshape",
+    # Reshape cannot accept any format, because its output format does not necessarily
+    # match its input format. So, if the previous layer is changed from NCHW to NHWC
+    # by adapt_to_backend(), it won't propagate the new format, ultimately leading
+    # to a missing transpose for the next layer!
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default)),
+    aidge_core.ProdConso.in_place_model)
 class ReshapeCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
         self.config_template = str(
-            ROOT / "templates" / "configuration" / "reshape_config.jinja")
+            ROOT / "templates" / "configuration" / "identity_config.jinja")
         self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "reshape_forward.jinja")
+            ROOT / "templates" / "kernel_forward" / "identity_forward.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "kernels" / "reshape.hpp"),
+            str(ROOT / "kernels" / "identity.hpp"),
         ]
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") 
diff --git a/aidge_export_cpp/operators/Sigmoid.py b/aidge_export_cpp/operators/Sigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..de89d08631ab44ef32df99d02b6f2de3a944c822
--- /dev/null
+++ b/aidge_export_cpp/operators/Sigmoid.py
@@ -0,0 +1,21 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
+from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
+
+@ExportLibCpp.register("Sigmoid",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)),
+    aidge_core.ProdConso.in_place_model)
+class SigmoidCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "sigmoid_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "sigmoid_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "sigmoid.hpp"),
+            str(ROOT / "kernels" / "activation.hpp")
+        ]
\ No newline at end of file
diff --git a/aidge_export_cpp/operators/Slice.py b/aidge_export_cpp/operators/Slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..26041b0178dd43d867ca00382fe0a675a3c865d9
--- /dev/null
+++ b/aidge_export_cpp/operators/Slice.py
@@ -0,0 +1,55 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp
+from aidge_export_cpp import ROOT
+from aidge_export_cpp import ExportLibCpp
+
+@ExportLibCpp.register("Slice",
+    aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class SliceCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        # Secure retrieve parameter attributes
+        input_dims = self.attributes["in_dims"][0]
+        axes = [a if a >= 0 else a + len(input_dims) for a in node.get_operator().attr.axes] # positive axes
+        starts, ends, steps = node.get_operator().attr.starts, node.get_operator().attr.ends, node.get_operator().attr.steps
+        assert len(starts) == len(axes)
+        assert len(ends) == len(axes)
+        assert len(steps) == len(axes)
+        # positive start and end indices
+        starts = [s if s >= 0 else s + input_dims[axes[i]] for i, s in enumerate(starts)]
+        ends = [e if e >= 0 else e + input_dims[axes[i]] for i, e in enumerate(ends)]
+        # assert boundaries
+        for a in axes: assert a>=0 and a < len(input_dims)
+        for i,e in enumerate(ends): assert e>=0 and e <= input_dims[axes[i]]
+        for i,s in enumerate(starts): assert s>=0 and s < ends[i]
+        for st in steps: assert st >= 1
+        self.attributes["starts"] = starts
+        self.attributes["ends"] = ends
+        self.attributes["steps"] = steps
+
+        # Compute mod and div values used to convert an input flat index into per-axis indices
+        axes_mod = [input_dims[a] for a in axes]
+        axes_div = len(axes) * [1]
+        for i,ax in enumerate(axes):
+            for j in range(ax+1, len(input_dims)):
+                axes_mod[i] *= input_dims[j]
+                axes_div[i] *= input_dims[j]
+        self.attributes["axes_mod"] = axes_mod
+        self.attributes["axes_div"] = axes_div
+
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "slice_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "slice_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "slice.hpp"),
+        ]
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")   # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp")
diff --git a/aidge_export_cpp/operators/Transpose.py b/aidge_export_cpp/operators/Transpose.py
index 63082ea67fcd35548681108b9ae6306c600fa252..e0784374d1b6282605b752805bd3d25f0abe0784 100644
--- a/aidge_export_cpp/operators/Transpose.py
+++ b/aidge_export_cpp/operators/Transpose.py
@@ -7,6 +7,27 @@ from aidge_export_cpp import ExportLibCpp
 class TransposeCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
+
+        nbdims = len(self.attributes["in_dims"][0])
+
+        # Compute input strides
+        in_strides = [0] * nbdims
+        in_strides[nbdims - 1] = 1
+        for i in range(nbdims - 2, -1, -1):
+            in_strides[i] = in_strides[i + 1] * self.attributes["in_dims"][0][i + 1]
+
+        # Compute output dimensions based on permutation
+        out_dims = [self.attributes["in_dims"][0][self.attributes["output_dims_order"][i]] for i in range(nbdims)]
+
+        # Compute output strides
+        out_strides = [0] * nbdims
+        out_strides[nbdims - 1] = 1
+        for i in range(nbdims - 2, -1, -1):
+            out_strides[i] = out_strides[i + 1] * out_dims[i + 1]
+
+        self.attributes["in_strides"] = in_strides
+        self.attributes["out_strides"] = out_strides
+
         self.config_template = str(
             ROOT / "templates" / "configuration" / "transpose_ND_config.jinja")
         self.forward_template = str(
diff --git a/aidge_export_cpp/static/Makefile b/aidge_export_cpp/static/Makefile
index 176b8fc1dc3082c5de4cefbba961073b446f4614..5c95c2258040f701c23ee6f56b88b3504bc9918c 100644
--- a/aidge_export_cpp/static/Makefile
+++ b/aidge_export_cpp/static/Makefile
@@ -1,16 +1,43 @@
 CC := g++
-CCFLAGS := ${CCFLAGS} -O2 -Wall -Wextra -MMD -fopenmp
+CCFLAGS_COMMON := -Wall -Wextra -MMD -fopenmp
+CCFLAGS_RELEASE := -O2
+CCFLAGS_DEBUG := -g -O0 -DDEBUG
+CCFLAGS_ASAN := -g -O1 -DDEBUG -fsanitize=address -fno-omit-frame-pointer
+CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_RELEASE}
 LDFLAGS := ${LDFLAGS} -fopenmp
+LDFLAGS_ASAN := ${LDFLAGS} -fsanitize=address
+
 OBJDIR := build
 DNNDIR := dnn
 BINDIR := bin
 TARGET := $(BINDIR)/run_export
+DEBUG_OBJDIR := build_debug
+DEBUG_TARGET := $(BINDIR)/run_export_debug
+ASAN_OBJDIR := build_asan
+ASAN_TARGET := $(BINDIR)/run_export_asan
+
+# Export Params
+# Compare fmaps with Aidge ref. Comments stay on their own line: an inline
+# comment after ":=" would leave trailing whitespace in the value and break
+# the ifeq comparisons below.
+AIDGE_CMP := false
+# Store fmaps into files
+SAVE_OUTPUTS := false
+
+ifeq ($(AIDGE_CMP), true)
+	PRMFLAGS := ${PRMFLAGS} -DAIDGE_CMP
+endif
 
+ifeq ($(SAVE_OUTPUTS), true)
+	PRMFLAGS := ${PRMFLAGS} -DSAVE_OUTPUTS
+endif
 
 INCLUDE_DIRS :=-I. -I./${DNNDIR} -I./${DNNDIR}/include -I./${DNNDIR}/layers -I./${DNNDIR}/parameters
 CC_SRCS := $(shell find . -iname "*.cpp")
 CC_OBJS := $(patsubst %.cpp, ${OBJDIR}/%.o, ${CC_SRCS})
+DEBUG_OBJS := $(patsubst %.cpp, ${DEBUG_OBJDIR}/%.o, ${CC_SRCS})
+ASAN_OBJS := $(patsubst %.cpp, ${ASAN_OBJDIR}/%.o, ${CC_SRCS})
 DEPENDENCIES := $(patsubst %.o, %.d, ${CC_OBJS})
+DEBUG_DEPENDENCIES := $(patsubst %.o, %.d, ${DEBUG_OBJS})
+ASAN_DEPENDENCIES := $(patsubst %.o, %.d, ${ASAN_OBJS})
+
+.PHONY: all build debug asan clean
 
 all: build
 
@@ -18,13 +45,35 @@ build: ${CC_OBJS}
 	@mkdir -p $(dir ${TARGET})
 	${CC} ${CC_OBJS} ${LDFLAGS} -o ${TARGET}
 
+debug: CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_DEBUG}
+debug: ${DEBUG_OBJS}
+	@mkdir -p $(dir ${DEBUG_TARGET})
+	${CC} ${DEBUG_OBJS} ${LDFLAGS} -o ${DEBUG_TARGET}
+
+asan: CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_ASAN}
+asan: ${ASAN_OBJS}
+	@mkdir -p $(dir ${ASAN_TARGET})
+	${CC} ${ASAN_OBJS} ${LDFLAGS_ASAN} -o ${ASAN_TARGET}
+
 ${OBJDIR}/%.o: %.cpp
 	@mkdir -p $(dir $@)
-	${CC} ${CCFLAGS} ${INCLUDE_DIRS} -c $< -o $@ 
+	${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@
+
+${DEBUG_OBJDIR}/%.o: %.cpp
+	@mkdir -p $(dir $@)
+	${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@
+
+${ASAN_OBJDIR}/%.o: %.cpp
+	@mkdir -p $(dir $@)
+	${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@
 
 clean:
 	if [ -d "$(OBJDIR)" ]; then rm -rf $(OBJDIR); fi
+	if [ -d "$(DEBUG_OBJDIR)" ]; then rm -rf $(DEBUG_OBJDIR); fi
+	if [ -d "$(ASAN_OBJDIR)" ]; then rm -rf $(ASAN_OBJDIR); fi
 	if [ -d "$(BINDIR)" ]; then rm -rf $(BINDIR); fi
 
 
 -include $(DEPENDENCIES)
+-include $(DEBUG_DEPENDENCIES)
+-include $(ASAN_DEPENDENCIES)
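Usage of the new targets, for reference: make (or make build) still produces the optimized bin/run_export; make debug and make asan compile into their own object directories (build_debug, build_asan), so the three configurations coexist without clobbering each other, and make clean removes all of them. The switches are meant to be set on the command line, e.g. make AIDGE_CMP=true SAVE_OUTPUTS=true, which adds -DAIDGE_CMP and -DSAVE_OUTPUTS to PRMFLAGS for every object rule.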
diff --git a/aidge_export_cpp/static/activation_utils.hpp b/aidge_export_cpp/static/activation_utils.hpp
index c6a1bcdc0ce289a384519673ca04a001a5ca9692..ddac7fb46061c5ca909d1f0571723f0842dbda64 100644
--- a/aidge_export_cpp/static/activation_utils.hpp
+++ b/aidge_export_cpp/static/activation_utils.hpp
@@ -48,9 +48,5 @@ Output_T activation_forward_value (Sum_T weightedSum,
             break;
     }
 
-    // Value fixed here for now but it should be generated by
-    // the export module or determined by the type of Output_T
-    // For now only works for int8_t and uint8_t
-    const uint32_t NB_BITS = 8;
-    return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
+    return saturate<Output_T>(rescaling(weightedSum, output), 8 * sizeof(Output_T));
 }
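The replaced constant generalizes the clamp from a hard-coded 8 bits to the bit width of Output_T. A Python sketch of the saturation rule, assuming saturate clamps to the signed two's-complement range of NB_BITS (which matches the previous int8-only behaviour; the real helper may treat unsigned types differently):

def saturate(value: int, nb_bits: int) -> int:
    # Clamp to the signed range of nb_bits (hypothetical mirror of saturate<T>)
    lo, hi = -(1 << (nb_bits - 1)), (1 << (nb_bits - 1)) - 1
    return max(lo, min(hi, value))

# 8 * sizeof(Output_T): int8_t -> 8 bits, int16_t -> 16 bits, ...
assert saturate(300, 8) == 127
assert saturate(300, 16) == 300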
diff --git a/aidge_export_cpp/static/rescaling_utils.hpp b/aidge_export_cpp/static/rescaling_utils.hpp
index 4fdb321820f92f8d33e474aabc4665a99cb0d4b0..8825197e743e7a488cbc411dc83af6a1968d9974 100644
--- a/aidge_export_cpp/static/rescaling_utils.hpp
+++ b/aidge_export_cpp/static/rescaling_utils.hpp
@@ -1,14 +1,16 @@
 #pragma once
 
+#include <sys/types.h>
+
 // ---------------------------------------------------
 // ----------------- Saturate Utils ------------------
 // ---------------------------------------------------
 
-static int64_t toInt64(uint32_t lo, uint32_t hi) {
+constexpr int64_t toInt64(uint32_t lo, uint32_t hi) {
     return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo);
 }
 
-static int64_t smlal(int32_t lhs, int32_t rhs, 
+constexpr int64_t smlal(int32_t lhs, int32_t rhs, 
                      uint32_t accumLo, uint32_t accumHi) 
 {
     return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi);
@@ -52,9 +54,9 @@ struct FixedPointScaling {
     }
 
     // Attributes
-    static const uint32_t HALF_LO = (SHIFT > 0)
+    static constexpr uint32_t HALF_LO = (SHIFT > 0)
         ? (1ull << (SHIFT - 1)) & 0xFFFFFFFF : 0;
-    static const uint32_t HALF_HI = (SHIFT > 0)
+    static constexpr uint32_t HALF_HI = (SHIFT > 0)
         ? (1ull << (SHIFT - 1)) >> 32u : 0;
     
     // static const int32_t mScaling = SCALING;
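HALF_LO/HALF_HI split the rounding constant 2^(SHIFT-1) into two 32-bit halves so smlal() can fold round-to-nearest into the multiply-accumulate, presumably followed by an arithmetic shift by SHIFT as the struct's name suggests. A Python sketch of that arithmetic using the same definitions:

def to_int64(lo: int, hi: int) -> int:
    return (hi << 32) | lo

def smlal(lhs: int, rhs: int, accum_lo: int, accum_hi: int) -> int:
    return lhs * rhs + to_int64(accum_lo, accum_hi)

SHIFT, SCALING = 20, 1234567          # hypothetical compile-time constants
HALF = 1 << (SHIFT - 1)
HALF_LO, HALF_HI = HALF & 0xFFFFFFFF, HALF >> 32

x = 987654
# Round-to-nearest fixed-point rescale: (x * SCALING + 2^(SHIFT-1)) >> SHIFT
assert smlal(x, SCALING, HALF_LO, HALF_HI) >> SHIFT == (x * SCALING + HALF) >> SHIFT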
diff --git a/aidge_export_cpp/static/typedefs.hpp b/aidge_export_cpp/static/typedefs.hpp
index acece91115f73a57197c8a423cd34ec37b2f2e2a..8ecd095639a1afeb03b96f8f59149545cfd3a825 100644
--- a/aidge_export_cpp/static/typedefs.hpp
+++ b/aidge_export_cpp/static/typedefs.hpp
@@ -19,7 +19,8 @@ typedef enum {
 typedef enum {
     Add,
     Sub,
-    Mul
+    Mul, 
+    Div
 } ElemWise_T;
 
 typedef enum {
diff --git a/aidge_export_cpp/static/utils.hpp b/aidge_export_cpp/static/utils.hpp
index b9b739269216f6e02e05f5da3bcb3c2a8df30150..6142ac3f34951c424910b7294e8ba0af886c091e 100644
--- a/aidge_export_cpp/static/utils.hpp
+++ b/aidge_export_cpp/static/utils.hpp
@@ -52,10 +52,24 @@ int min (int lhs, int rhs)
     return (lhs <= rhs) ? lhs : rhs;
 }
 
+// Self-contained replacement for std::copy_n (avoids pulling in <algorithm>):
+// the first element is copied up front, then "first" advances count-1 times.
+template <class InputIt, class Size, class OutputIt>
+__attribute__((always_inline))
+static inline OutputIt copy_n(InputIt first, Size count, OutputIt result) {
+    if (count > 0) {
+        *result = *first;
+        ++result;
+        for (Size i = 1; i != count; ++i, ++result) {
+            *result = *++first;
+        }
+    }
+
+    return result;
+}
+
+#if SAVE_OUTPUTS || AIDGE_CMP
 
-#if SAVE_OUTPUTS
 enum class Format {
-    Default,
+    DEFAULT,
     NCHW,
     NHWC,
     CHWN,
@@ -64,105 +78,152 @@ enum class Format {
     CDHWN
 };
 
+#endif  // SAVE_OUTPUTS || AIDGE_CMP
+
+#if SAVE_OUTPUTS
+
+template<int NB_OUTPUTS, int OUT_HEIGHT, int OUT_WIDTH,
+    size_t MEM_CONT_OFFSET,
+    size_t MEM_CONT_SIZE,
+    size_t MEM_WRAP_OFFSET,
+    size_t MEM_WRAP_SIZE,
+    Format FMT, typename Output_T>
+inline void saveOutputs(const Output_T* __restrict outputs, FILE* pFile) {
+    int offset = 0;
+
+    // NCHW
+    if (FMT == Format::NCHW || FMT == Format::DEFAULT) {
+        fprintf(pFile, "{");
+        for (auto out = 0; out < NB_OUTPUTS; ++out) {
+            fprintf(pFile, "{");
+            for (auto h = 0; h < OUT_HEIGHT; ++h) {
+                fprintf(pFile, "{");
+                for (auto w = 0; w < OUT_WIDTH; ++w) {
+                    if (MEM_WRAP_SIZE > 0 && offset == static_cast<int>(MEM_CONT_SIZE / sizeof(Output_T))) {
+                        offset += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET
+                                    - MEM_CONT_SIZE) / sizeof(Output_T);
+                    }
 
-template<typename Output_T>
-inline void saveOutputs(
-    int NB_OUTPUTS,
-    int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-    // int OUTPUT_MEM_CONT_OFFSET,
-    // int OUTPUT_MEM_CONT_SIZE,
-    // int OUTPUT_MEM_WRAP_OFFSET,
-    // int OUTPUT_MEM_WRAP_SIZE,
-    // int OUTPUT_MEM_STRIDE,
-    const Output_T* __restrict outputs,
-    FILE* pFile,
-    Format format)
-{
-    // default is NHCW !
-    if (format == Format::NHWC) {
-        fprintf(pFile, "(");
-        auto oOffset = 0;
-        for(int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
-            fprintf(pFile, "(");
-
-            for(int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
-                fprintf(pFile, "(");
-
-                // const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                // int oOffset = OUTPUT_MEM_STRIDE * oPos;
-
-                // if (OUTPUT_MEM_WRAP_SIZE > 0
-                //     && oOffset >= OUTPUT_MEM_CONT_SIZE)
-                // {
-                //     oOffset += OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
-                //                 - OUTPUT_MEM_CONT_SIZE;
-                // }
-
-                for (int output = 0; output < NB_OUTPUTS; output++) {
                     if (std::is_floating_point<Output_T>::value)
-                        fprintf(pFile, "%f", static_cast<float>(outputs[oOffset]));
+                        fprintf(pFile, "%f", static_cast<float>(outputs[offset]));
                     else
-                        fprintf(pFile, "%d", static_cast<int>(outputs[oOffset]));
-                    oOffset += 1;
+                        fprintf(pFile, "%d", static_cast<int>(outputs[offset]));
+                    ++offset;
 
                     fprintf(pFile, ", ");
-                }
 
-                fprintf(pFile, "), \n");
+                }
+                fprintf(pFile, "}\n");
             }
-
-            fprintf(pFile, "), \n");
+            fprintf(pFile, "}\n");
         }
+        fprintf(pFile, "}\n");
 
-        fprintf(pFile, ")\n");
-    }
-    else if (format == Format::NCHW || format == Format::Default) {
-        auto ofst = 0;
-        for(int output = 0; output < NB_OUTPUTS; output++) {
-            fprintf(pFile, "%d:\n", output);
-            for(int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
-                for(int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
-                    fprintf(pFile, "%d",  static_cast<int>(outputs[ofst]));
-                    fprintf(pFile, " ");
-                    ofst += 1;
-                }
+    // NHWC
+    } else if (FMT == Format::NHWC) {
+        fprintf(pFile, "{\n"); // Start outer brace
+        for (auto h = 0; h < OUT_HEIGHT; ++h) {
+            fprintf(pFile, "  {\n"); // Indent level 1
+            for (auto w = 0; w < OUT_WIDTH; ++w) {
+                fprintf(pFile, "    { "); // Indent level 2 and open inner brace
+                for (auto out = 0; out < NB_OUTPUTS; ++out) {
+                    if (MEM_WRAP_SIZE > 0 && offset == static_cast<int>(MEM_CONT_SIZE / sizeof(Output_T))) {
+                        offset += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET
+                                    - MEM_CONT_SIZE) / sizeof(Output_T);
+                    }
 
-                fprintf(pFile, "\n");
-            }
+                    if (std::is_floating_point<Output_T>::value)
+                        fprintf(pFile, "%f", static_cast<float>(outputs[offset]));
+                    else
+                        fprintf(pFile, "%4d", static_cast<int>(outputs[offset]));
+                    ++offset;
 
-            fprintf(pFile, "\n");
+                    // Add comma except for last element
+                    if (out != NB_OUTPUTS - 1)
+                        fprintf(pFile, ",");
+                }
+                fprintf(pFile, " },\n"); // Close inner brace and newline
+            }
+            fprintf(pFile, "  },\n"); // Close w-loop brace and newline
         }
+        fprintf(pFile, "}\n"); // Close outer brace
 
-        fprintf(pFile, "\n");
-    }
-    else {
-        printf("Warning unsupported dataformat.\n");
+    } else {
+        printf("[ERROR] - Format is not supported.\n");
+        printf("[ERROR] - Aborting save outputs...\n");
+        return;
     }
 }
 #endif // SAVE_OUTPUTS
 
 #if AIDGE_CMP
 
-template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT, typename AidgeOutput_T, typename DevOutput_T>
+template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT, Format FMT>
+int get_ofst_from_fmt(int out, int h, int w) {
+    if (FMT == Format::NCHW || FMT == Format::DEFAULT)
+        return out * OUT_HEIGHT * OUT_WIDTH + h * OUT_WIDTH + w;
+    else if (FMT == Format::NHWC)
+        return h * OUT_WIDTH * NB_OUTPUTS + w * NB_OUTPUTS + out;
+    else {
+        printf("[ERROR] - This data format is not supported.\n");
+        return -1;
+    }
+}
+
+template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT,
+    size_t MEM_CONT_OFFSET,
+    size_t MEM_CONT_SIZE,
+    size_t MEM_WRAP_OFFSET,
+    size_t MEM_WRAP_SIZE,
+    Format AIDGE_FMT, Format DEV_FMT, typename AidgeOutput_T, typename DevOutput_T>
 void aidge_cmp(std::string layer_name, AidgeOutput_T* aidge_output, DevOutput_T* dev_output) {
 
-    printf("[AIDGE COMPARE] - %s\n", layer_name.c_str());
+    printf("[NOTICE] - Comparing with Aidge ref for node : %s -> ", layer_name.c_str());
+
+    const float atol = 1e-5f;   // Absolute
+    const float rtol = 1e-3f;   // Relative
 
     for (auto out = 0; out < NB_OUTPUTS; ++out) {
         for (auto h = 0; h < OUT_HEIGHT; ++h) {
             for (auto w = 0; w < OUT_WIDTH; ++w) {
-                const int aidge_ofst = out * OUT_HEIGHT * OUT_WIDTH + h * OUT_WIDTH + w;
-                const int dev_ofst = h * OUT_WIDTH * NB_OUTPUTS + w * NB_OUTPUTS + out;
-                if (aidge_output[aidge_ofst] != dev_output[dev_ofst]) {
-                    if (std::is_floating_point<DevOutput_T>::value) {
-                        printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %f vs dev_out = %f\n",
-                                out, h, w, static_cast<double>(aidge_output[aidge_ofst]), static_cast<double>(dev_output[dev_ofst]));
-                    } else {
+
+                const int aidge_ofst = get_ofst_from_fmt<NB_OUTPUTS, OUT_WIDTH, OUT_HEIGHT, AIDGE_FMT>(out, h, w);
+                int dev_ofst   = get_ofst_from_fmt<NB_OUTPUTS, OUT_WIDTH, OUT_HEIGHT, DEV_FMT>(out, h, w);
+
+                if (aidge_ofst == -1 || dev_ofst == -1) {
+                    printf("[FAILURE]\n");
+                    printf("[ERROR] - Aborting this layer comparison...\n");
+                    return;
+                }
+
+                if (MEM_WRAP_SIZE > 0 && dev_ofst >= static_cast<int>(MEM_CONT_SIZE / sizeof(DevOutput_T))) {
+                    dev_ofst += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET
+                                - MEM_CONT_SIZE) / sizeof(DevOutput_T);
+                }
+
+                // Float Comparison
+                if (std::is_floating_point<DevOutput_T>::value) {
+
+                    const float diff = std::abs(aidge_output[aidge_ofst] - dev_output[dev_ofst]);
+                    const float tolerance = atol + rtol * std::abs(dev_output[dev_ofst]);
+
+                    if (diff > tolerance) {
+                        printf("[FAILURE]\n");
+                        printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %f vs dev_out = %f\n",
+                                out, h, w, static_cast<double>(aidge_output[aidge_ofst]), static_cast<double>(dev_output[dev_ofst]));
+                        printf("[ERROR] - Abort program.\n");
+                        exit(1);
+                    }
+
+                // Int Comparison
+                } else {
+                    if (aidge_output[aidge_ofst] != dev_output[dev_ofst]) {
+                        printf("[FAILURE]\n");
                         printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %d vs dev_out = %d\n",
-                              out, h, w, static_cast<int>(aidge_output[aidge_ofst]), static_cast<int>(dev_output[dev_ofst]));
+                                out, h, w, static_cast<int>(aidge_output[aidge_ofst]), static_cast<int>(dev_output[dev_ofst]));
+                        printf("[ERROR] - Abort program.\n");
+                        exit(1);
                     }
-                    printf("Abort program.\n");
-                    exit(1);
                 }
             }
         }
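Both saveOutputs and aidge_cmp above apply the same correction when the memory planner splits a tensor between a contiguous region and a wrap region: once the element offset passes the end of the contiguous part, it jumps by the gap between the two regions. A Python sketch of that address translation (element units; names mirror the template parameters):

def wrapped_offset(ofst, cont_offset, cont_size, wrap_offset, wrap_size, elem_size):
    # Offsets past the contiguous region are relocated into the wrap region.
    if wrap_size > 0 and ofst >= cont_size // elem_size:
        ofst += (wrap_offset - cont_offset - cont_size) // elem_size
    return ofst

# e.g. float32 tensor, 800 contiguous bytes at offset 0, wrap region at byte 4096:
# element 200 is the first wrapped one and lands at element 4096 / 4 = 1024
assert wrapped_offset(200, 0, 800, 4096, 200, 4) == 1024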
diff --git a/aidge_export_cpp/templates/configuration/_def_io.jinja b/aidge_export_cpp/templates/configuration/_def_io.jinja
index 314ae39a491f2bf1eafec4b0ee5aaab156d3c987..4bbb86666d56656cd2477f6b0d398b58bf08be75 100644
--- a/aidge_export_cpp/templates/configuration/_def_io.jinja
+++ b/aidge_export_cpp/templates/configuration/_def_io.jinja
@@ -1,9 +1,11 @@
 {# NOTE: Suppose input is first #}
 {# // INPUT CONF
 {% for inidx in range(nb_in) -%}
+{% if in_name[inidx] %}
 #define {{ in_name[inidx]|upper }}_NB_CHANNELS {{ in_chan[inidx] }}
 #define {{ in_name[inidx]|upper }}_IN_HEIGHT {{ in_height[inidx] }}
 #define {{ in_name[inidx]|upper }}_IN_WIDTH {{ in_width[inidx] }}
+{% endif %}
 {% endfor %} #}
 
 // INPUT CONF
diff --git a/aidge_export_cpp/templates/configuration/_meminfo.jinja b/aidge_export_cpp/templates/configuration/_meminfo.jinja
index 15d15425b4330f68b4a97c31e9cf7a1076cc93e8..6c99509c8fc93fd550a1b45e4803e23a072145f2 100644
--- a/aidge_export_cpp/templates/configuration/_meminfo.jinja
+++ b/aidge_export_cpp/templates/configuration/_meminfo.jinja
@@ -1,11 +1,34 @@
 // MEMINFO CONF
+{% for inidx in range(nb_in) -%}
+{% if in_name[inidx] %}
+{# Specify a default memory layout for standalone input tensors -#}
+{% if not in_node[inidx] %}
+{% if in_dformat[inidx]|string == 'nhwc' or in_dformat[inidx]|string == 'nwc' %}
+#define {{ in_name[inidx]|upper }}_MEM_SIZE {{ in_chan[inidx] * in_sizeof[inidx] }}
+#define {{ in_name[inidx]|upper }}_MEM_OFFSET 0
+#define {{ in_name[inidx]|upper }}_MEM_STRIDE {{ in_chan[inidx] * in_sizeof[inidx] }}
+#define {{ in_name[inidx]|upper }}_MEM_LENGTH {{ in_width[inidx] }}
+{% else %}
+#define {{ in_name[inidx]|upper }}_MEM_SIZE {{ in_size[inidx] * in_sizeof[inidx] }}
+#define {{ in_name[inidx]|upper }}_MEM_OFFSET 0
+#define {{ in_name[inidx]|upper }}_MEM_STRIDE {{ in_size[inidx] * in_sizeof[inidx] }}
+#define {{ in_name[inidx]|upper }}_MEM_LENGTH 1
+{% endif %}
+#define {{ in_name[inidx]|upper }}_MEM_CONT_SIZE {{ in_size[inidx] * in_sizeof[inidx] }}
+#define {{ in_name[inidx]|upper }}_MEM_CONT_OFFSET 0
+#define {{ in_name[inidx]|upper }}_MEM_WRAP_OFFSET 0
+#define {{ in_name[inidx]|upper }}_MEM_WRAP_SIZE 0
+{% endif %}
+{% endif %}
+{% endfor %}
+
 {% for outidx in range(nb_out) -%}
-#define {{ out_name[outidx]|upper }}_SIZE {{ mem_info_size[outidx]}}
-#define {{ out_name[outidx]|upper }}_OFFSET {{ mem_info_offset[outidx]}}
-#define {{ out_name[outidx]|upper }}_STRIDE {{ mem_info_stride[outidx]}}
-#define {{ out_name[outidx]|upper }}_LENGTH {{ mem_info_length[outidx]}}
-#define {{ out_name[outidx]|upper }}_CONT_SIZE {{ mem_info_cont_size[outidx]}}
-#define {{ out_name[outidx]|upper }}_CONT_OFFSET {{ mem_info_cont_offset[outidx]}}
-#define {{ out_name[outidx]|upper }}_WRAP_OFFSET {{ mem_info_wrap_offset[outidx]}}
-#define {{ out_name[outidx]|upper }}_WRAP_SIZE {{ mem_info_wrap_size[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_SIZE {{ mem_info_size[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_OFFSET {{ mem_info_offset[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_STRIDE {{ mem_info_stride[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_LENGTH {{ mem_info_length[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_CONT_SIZE {{ mem_info_cont_size[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_CONT_OFFSET {{ mem_info_cont_offset[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_WRAP_OFFSET {{ mem_info_wrap_offset[outidx]}}
+#define {{ out_name[outidx]|upper }}_MEM_WRAP_SIZE {{ mem_info_wrap_size[outidx]}}
 {% endfor %}
diff --git a/aidge_export_cpp/templates/configuration/_save_outputs.jinja b/aidge_export_cpp/templates/configuration/_save_outputs.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..358b9cf40e16715882809a1b69be8b86493799b4
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/_save_outputs.jinja
@@ -0,0 +1 @@
+#define {{ out_name[0] | upper }}_DEV_FMT Format::{{ out_format[0] | upper }}
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/activation_config.jinja b/aidge_export_cpp/templates/configuration/activation_config.jinja
index 84b122ba5207f0022d72f35bb4f8e7064bf7fe32..45c2760706fd6e3a07b9cf5c9be8a1e585a2f0b1 100644
--- a/aidge_export_cpp/templates/configuration/activation_config.jinja
+++ b/aidge_export_cpp/templates/configuration/activation_config.jinja
@@ -4,9 +4,9 @@
 #include "network/rescaling_utils.hpp"
 
 {# For layer configuration -#}
-{%- set nb_data = in_chan[0] * in_height[0] * in_width[0] %}
-#define {{ name|upper }}_NB_DATA {{ nb_data }}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
 #define {{ name|upper }}_ACTIVATION {{ activation }}
+{% include "./_save_outputs.jinja" %}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 {% include "./_rescaling.jinja" %}
diff --git a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
index 0c0bc49b521556eee1a4e455486caae44a2b86cb..34412212dbb1ff689d4c04d89f8206e22bad8d3b 100644
--- a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
+++ b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
@@ -8,6 +8,7 @@
 {% include "./_meminfo.jinja" %}
 #define {{ name|upper }}_ACTIVATION {{ activation }}
 #define {{ name|upper }}_EPSILON {{ epsilon }}
+{% include "./_save_outputs.jinja" %}
 {% include "./_rescaling.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/concat_config.jinja b/aidge_export_cpp/templates/configuration/concat_config.jinja
index ea8246db9a315a371e0cacea5d45d07fa2b8f7e8..faaf5aad481975a80f5f95e50277fe4547ee1522 100644
--- a/aidge_export_cpp/templates/configuration/concat_config.jinja
+++ b/aidge_export_cpp/templates/configuration/concat_config.jinja
@@ -2,17 +2,20 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
 
+#include <sys/types.h>
+
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 
 // Attributes
 #define {{ name|upper }}_NB_INPUTS {{ nb_in }}
 #define {{ name|upper }}_AXIS {{ axis }}
-{%- for i in range(nb_in) %}
-#define {{ name|upper }}_INPUT_{{i}}_SIZE {{ axis_size[i] }}
-{%- endfor %}
+constexpr size_t {{name|upper}}_AXIS_SIZE[] = { {{ axis_size|join(", ") }} };
+#define {{name|upper}}_TOTAL_AXIS_SIZE ({{ axis_size|join('+') }})
 
 #define {{ name|upper }}_AXIS_SIZE_POST {{ axis_size_post }}
 #define {{ name|upper }}_AXIS_SIZE_PRE {{ axis_size_pre }}
 
+{% include "./_save_outputs.jinja" %}
+
 #endif /* {{ name|upper }}_LAYER_H */
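One plausible reading of the reworked concat parameters, sketched in Python (the exact loop order lives in concat.hpp; this only illustrates the roles of AXIS_SIZE_PRE/POST and the per-input axis sizes, and is an assumption):

def concat_forward(inputs, axis_sizes, pre, post):
    # For each leading ("pre") slice, append each input's axis_size*post chunk.
    out = []
    for p in range(pre):
        for k, inp in enumerate(inputs):
            chunk = axis_sizes[k] * post
            out.extend(inp[p * chunk : (p + 1) * chunk])
    return out

# Concat along axis 1 of shapes (2,1,2) and (2,2,2): pre = 2, post = 2
a = [1, 2, 3, 4]
b = [5, 6, 7, 8, 9, 10, 11, 12]
print(concat_forward([a, b], [1, 2], pre=2, post=2))
# -> [1, 2, 5, 6, 7, 8, 3, 4, 9, 10, 11, 12]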
diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja
index b72df4d10f5342f661e921f4b2a7dbaf79d32e85..5774e031ba553414250886a0896de1f491425e80 100644
--- a/aidge_export_cpp/templates/configuration/convolution_config.jinja
+++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja
@@ -5,20 +5,21 @@
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
-#define {{ name|upper }}_PADDING_Y {{ padding[0] }}
-#define {{ name|upper }}_PADDING_X {{ padding[1] }}
-#define {{ name|upper }}_STRIDE_Y {{ stride_dims[0] }}
-#define {{ name|upper }}_STRIDE_X {{ stride_dims[1] }}
-#define {{ name|upper }}_DILATION_Y {{ dilation_dims[0] }}
-#define {{ name|upper }}_DILATION_X {{ dilation_dims[1] }}
-#define {{ name|upper }}_KERNEL_HEIGHT {{ kernel_dims[0] }}
-#define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] }}
+#define {{ name|upper }}_PADDING_Y {{ padding[0] if padding|length > 2 else 0 }}
+#define {{ name|upper }}_PADDING_X {{ padding[1] if padding|length > 2 else padding[0] }}
+#define {{ name|upper }}_STRIDE_Y {{ stride_dims[0] if stride_dims|length > 1 else 1 }}
+#define {{ name|upper }}_STRIDE_X {{ stride_dims[1] if stride_dims|length > 1 else stride_dims[0] }}
+#define {{ name|upper }}_DILATION_Y {{ dilation_dims[0] if dilation_dims|length > 1 else 1 }}
+#define {{ name|upper }}_DILATION_X {{ dilation_dims[1] if dilation_dims|length > 1 else dilation_dims[0] }}
+#define {{ name|upper }}_KERNEL_HEIGHT {{ kernel_dims[0] if kernel_dims|length > 1 else 1 }}
+#define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] if kernel_dims|length > 1 else kernel_dims[0] }}
 #define {{ name|upper }}_ACTIVATION {{ activation }}
+{% include "./_save_outputs.jinja" %}
 {% include "./_rescaling.jinja" %}
 
 {#- Calculate sizes #}
-{%- set weights_size = out_chan[0] * kernel_dims[1] * kernel_dims[0] if depthwise is defined
-    else out_chan[0] * in_chan[0] * kernel_dims[1] * kernel_dims[0] %}
+{%- set weights_size = out_chan[0] * (kernel_dims[1] if kernel_dims|length > 1 else 1) * kernel_dims[0] if depthwise is defined
+    else out_chan[0] * in_chan[0] * (kernel_dims[1] if kernel_dims|length > 1 else 1) * kernel_dims[0] %}
 #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
 #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
 
diff --git a/aidge_export_cpp/templates/configuration/elemwise_config.jinja b/aidge_export_cpp/templates/configuration/elemwise_config.jinja
index f839602fff707bc4dc30b11835846c977130cab4..2ccf5fa73a1b464ac1145684e92f1e00d0204b9f 100644
--- a/aidge_export_cpp/templates/configuration/elemwise_config.jinja
+++ b/aidge_export_cpp/templates/configuration/elemwise_config.jinja
@@ -3,12 +3,23 @@
 #define {{ name|upper }}_LAYER_H
 #include "network/rescaling_utils.hpp"
 
+#include <sys/types.h>
+
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 {# For layer configuration -#}
-#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_MAT {{ offset_in1|length }}
+#define {{ name|upper }}_INPUT1_CONT_SIZE {{ input1_cont_size }}
+#define {{ name|upper }}_INPUT2_CONT_SIZE {{ input2_cont_size }}
+#define {{ name|upper }}_OUTPUT_CONT_SIZE {{ output_cont_size }}
+
+constexpr size_t {{name|upper}}_OFFSET_IN1[] = { {{ offset_in1|join(", ") }} };
+constexpr size_t {{name|upper}}_OFFSET_IN2[] = { {{ offset_in2|join(", ") }} };
+
 #define {{ name|upper }}_ACTIVATION {{ activation }}
 #define {{ name|upper }}_ELEM_OP {{ elemwise_op }}
+
+{% include "./_save_outputs.jinja" %}
 {% include "./_rescaling.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
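The elemwise rework replaces the single NB_ELTS loop with NB_MAT broadcast blocks addressed through precomputed offset tables. The exact semantics live in the kernel; one plausible reading, as a hedged Python sketch (taking the offsets as per-block block indices into each input, which is an assumption):

def elemwise_add(in1, in2, off1, off2, in1_cont, in2_cont, out_cont):
    # Each output block m reads block off1[m] of input 1 and off2[m] of input 2.
    out = []
    for m in range(len(off1)):
        for i in range(out_cont):
            out.append(in1[off1[m] * in1_cont + i % in1_cont]
                       + in2[off2[m] * in2_cont + i % in2_cont])
    return out

# Broadcasting (2, 3) + (1, 3): both output rows reuse the single in2 row
print(elemwise_add([1, 2, 3, 4, 5, 6], [10, 20, 30],
                   off1=[0, 1], off2=[0, 0], in1_cont=3, in2_cont=3, out_cont=3))
# -> [11, 22, 33, 14, 25, 36]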
diff --git a/aidge_export_cpp/templates/configuration/erf_config.jinja b/aidge_export_cpp/templates/configuration/erf_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..1e8e02c81188ddfeeb0d1c6e821f2509d6bb9a9e
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/erf_config.jinja
@@ -0,0 +1,11 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+{% include "./_save_outputs.jinja" %}
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja b/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja
index 856d727abc11ceb6f914e9d71d286ef5882322d6..b50f64fb500b7ae036f91821b689175c2c9b3e92 100644
--- a/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja
+++ b/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja
@@ -13,4 +13,6 @@
 #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
 #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
 
+{% include "./_save_outputs.jinja" %}
+
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/hardmax_config.jinja b/aidge_export_cpp/templates/configuration/hardmax_config.jinja
index b5bf60cfd28ead3b0b787a0750c5ceae648e5e72..247a44cfe9da08864ea06d08264b04f9fb900876 100644
--- a/aidge_export_cpp/templates/configuration/hardmax_config.jinja
+++ b/aidge_export_cpp/templates/configuration/hardmax_config.jinja
@@ -10,5 +10,6 @@
 #define {{ name|upper }}_AXIS_STRIDE {{ axis_stride }}
 #define {{ name|upper }}_POSTAXIS_STRIDE {{ postaxis_stride }}
 #define {{ name|upper }}_INOUT_NB_ELTS {{ out_nb_elts }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/reshape_config.jinja b/aidge_export_cpp/templates/configuration/identity_config.jinja
similarity index 88%
rename from aidge_export_cpp/templates/configuration/reshape_config.jinja
rename to aidge_export_cpp/templates/configuration/identity_config.jinja
index 88c8cc21f134b333e354c0df96dd07f41e948506..80318d1fa23d8e6caa401d0d40f080339343c2be 100644
--- a/aidge_export_cpp/templates/configuration/reshape_config.jinja
+++ b/aidge_export_cpp/templates/configuration/identity_config.jinja
@@ -6,5 +6,6 @@
 {% include "./_meminfo.jinja" %}
 {# For layer configuration -#}
 #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja b/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja
index 80903622d394bac9132ae3015f82ef72ac2242ea..2d6bd87ff2eb64026dd25a5741bf1fb89f1ebc6d 100644
--- a/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja
+++ b/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja
@@ -5,7 +5,8 @@
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
-#define {{ name|upper }}_NB_DATA {{ nb_data }}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
 #define {{ name|upper }}_ALPHA {{ alpha }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja
index d0d4958e505b3208598fe387bba357a0c3d84602..0c4982867482738af9fddcd15cee59c308244c88 100644
--- a/aidge_export_cpp/templates/configuration/matmul_config.jinja
+++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja
@@ -3,16 +3,24 @@
 #define {{ name|upper }}_LAYER_H
 #include "network/rescaling_utils.hpp"
 
+#include <sys/types.h>
+
+{# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_NB_MAT {{ offset_in1|length }}
+constexpr size_t {{name|upper}}_OFFSET_IN1[] = { {{ offset_in1|join(", ") }} };
+constexpr size_t {{name|upper}}_OFFSET_IN2[] = { {{ offset_in2|join(", ") }} };
+
+#define {{ name|upper }}_N {{ n }}
+#define {{ name|upper }}_M {{ m }}
+#define {{ name|upper }}_K {{ k }}
 
-{# For layer configuration -#}
-#define {{ name|upper }}_M {{ in_dims[0][0] }}
-#define {{ name|upper }}_K {{ in_dims[0][1] }}
-#define {{ name|upper }}_N {{ in_dims[1][1] }}
 #define {{ name|upper }}_ACTIVATION {{ activation }}
-{% include "./_rescaling.jinja" %}
 
+{% include "./_save_outputs.jinja" %}
+
+{% include "./_rescaling.jinja" %}
 {#- Calculate sizes #}
 
 
diff --git a/aidge_export_cpp/templates/configuration/pad_config.jinja b/aidge_export_cpp/templates/configuration/pad_config.jinja
index 8b21577fe4d6f52ddb36ae796740f265db3d45cc..ed6f8ae79d19925041d7c2fec587a53f59770200 100644
--- a/aidge_export_cpp/templates/configuration/pad_config.jinja
+++ b/aidge_export_cpp/templates/configuration/pad_config.jinja
@@ -9,5 +9,6 @@
 #define {{ name|upper }}_PADDING_TOP {{ padding[0] }}
 #define {{ name|upper }}_PADDING_LEFT {{ padding[1] }}
 #define {{ name|upper }}_BORDER_VALUE {{ border_value }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/pooling_config.jinja b/aidge_export_cpp/templates/configuration/pooling_config.jinja
index 57608fcb9c8071a1c686b10565d6adab78735079..ad6a7aebe54d2386e5bc0d47b2e80b0ea019aad6 100644
--- a/aidge_export_cpp/templates/configuration/pooling_config.jinja
+++ b/aidge_export_cpp/templates/configuration/pooling_config.jinja
@@ -13,5 +13,6 @@
 #define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] }}
 #define {{ name|upper }}_POOLING_TYPE {{ pool_type }}
 #define {{ name|upper }}_ACTIVATION {{ activation }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/producer_config.jinja b/aidge_export_cpp/templates/configuration/producer_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..ad0660f50d94984b1113af2cc4fae4cca5ac00c5
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/producer_config.jinja
@@ -0,0 +1,8 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_meminfo.jinja" %}
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/reducemean_config.jinja b/aidge_export_cpp/templates/configuration/reducemean_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..369ac8a3e7d70f8949e1c18e65cd7b0eb319e71f
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/reducemean_config.jinja
@@ -0,0 +1,27 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+#include <sys/types.h>
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+
+#define {{ in_name[0]|upper }}_NB_DIMS {{ in_nb_dims }}
+#define {{ in_name[0]|upper }}_NB_ELTS {{ in_nb_elts }}
+#define {{ out_name[0]|upper }}_NB_ELTS {{ out_nb_elts }}
+#define {{ name|upper }}_NB_AXES_TO_REDUCES {{ nb_axes_to_reduce }}
+constexpr size_t {{ in_name[0]|upper }}_DIMS[{{ in_nb_dims }}] =
+  { {{ in_dims|join(", ") }} };
+constexpr size_t {{ name|upper }}_AXES_TO_REDUCE[{{ nb_axes_to_reduce }}] =
+  { {{ axes_to_reduce|join(", ") }} };
+constexpr size_t {{ name|upper }}_PREAXIS_STRIDES[{{ in_nb_dims }}] =
+  { {{ pre_axis_strides|join(", ") }} };
+constexpr size_t {{ name|upper }}_POSTAXIS_STRIDES[{{ in_nb_dims }}] =
+  { {{ post_axis_strides|join(", ") }} };
+
+{% include "./_save_outputs.jinja" %}
+
+#endif /* {{ name|upper }}_LAYER_H */
+
diff --git a/aidge_export_cpp/templates/configuration/rescaling_config.jinja b/aidge_export_cpp/templates/configuration/rescaling_config.jinja
deleted file mode 100644
index 6f4e3ad80e4f72d180a9add5fd7978181a71031d..0000000000000000000000000000000000000000
--- a/aidge_export_cpp/templates/configuration/rescaling_config.jinja
+++ /dev/null
@@ -1,16 +0,0 @@
-{#- For name header -#}
-#ifndef {{ name|upper }}_LAYER_H
-#define {{ name|upper }}_LAYER_H
-#include "network/rescaling_utils.hpp"
-
-{# For layer configuration -#}
-{% include "./_def_io.jinja" %}
-{% include "./_meminfo.jinja" %}
-
-#define {{ name|upper }}_NB_DATA {{ in_chan[0] * in_height[0] * in_width[0] }}
-
-// Activation
-#define {{ name|upper }}_ACTIVATION     {{ activation }}
-{% include "./_rescaling.jinja" %}
-
-#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/sigmoid_config.jinja b/aidge_export_cpp/templates/configuration/sigmoid_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..1e8e02c81188ddfeeb0d1c6e821f2509d6bb9a9e
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/sigmoid_config.jinja
@@ -0,0 +1,11 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+{% include "./_save_outputs.jinja" %}
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/slice_config.jinja b/aidge_export_cpp/templates/configuration/slice_config.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..e3af737ba8aef29937ea95da05a46310bc1f30a6
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/slice_config.jinja
@@ -0,0 +1,20 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+#include <sys/types.h>
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_DIMS {{ in_dims[0] | length }}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_AXES {{ axes| length }}
+constexpr size_t {{ name|upper }}_STARTS[] = { {{ starts | join(', ') }} };
+constexpr size_t {{ name|upper }}_ENDS[] = { {{ ends | join(', ') }} };
+constexpr size_t {{ name|upper }}_STEPS[] = { {{ steps | join(', ') }} };
+constexpr size_t {{ name|upper }}_AXES_MOD[] = { {{ axes_mod | join(', ') }} };
+constexpr size_t {{ name|upper }}_AXES_DIV[] = { {{ axes_div | join(', ') }} };
+{% include "./_save_outputs.jinja" %}
+
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/softmax_config.jinja b/aidge_export_cpp/templates/configuration/softmax_config.jinja
index 6dc6b3453e103757c184e7a6efdfd9d21c5cbd43..095bce86e3d30ef77a5599f47b031aa0ba25775e 100644
--- a/aidge_export_cpp/templates/configuration/softmax_config.jinja
+++ b/aidge_export_cpp/templates/configuration/softmax_config.jinja
@@ -8,5 +8,6 @@
 #define {{ name|upper }}_AXIS_SIZE {{ axis_size }}
 #define {{ name|upper }}_AXIS_SIZE_POST {{ axis_size_post }}
 #define {{ name|upper }}_AXIS_SIZE_PRE {{ axis_size_pre }}
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja b/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja
index 56bb5bd524a5927d7461391c18df6077c97766db..6ccd728adbd3256b98ff00284ba7d4aeb7ffcb3a 100644
--- a/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja
+++ b/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja
@@ -2,14 +2,19 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
 
+#include <sys/types.h>
+
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 {# Export suppose that batchsize = 1#}
 #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
 #define {{ name|upper }}_NB_DIMS {{ in_dims[0] | length }}
+constexpr size_t {{ name|upper }}_IN_STRIDES[] = { {{ in_strides | join(', ') }} };
+constexpr size_t {{ name|upper }}_OUT_STRIDES[] = { {{ out_strides | join(', ') }} };
 
-static constexpr unsigned int {{ name|upper }}_PERMUTE[] = { {{ output_dims_order | join(', ') }} };
-static constexpr unsigned int {{ name|upper }}_DIMS[] = { {{ in_dims[0] | join(', ') }}};
+constexpr size_t {{ name|upper }}_PERMUTE[] = { {{ output_dims_order | join(', ') }} };
+constexpr size_t {{ name|upper }}_DIMS[] = { {{ in_dims[0] | join(', ') }} };
+{% include "./_save_outputs.jinja" %}
 
 #endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/data/aidge_tensor.jinja b/aidge_export_cpp/templates/data/aidge_tensor.jinja
index 3f086afd0a8f9a479a6073b463cefce21fc3e752..a1d5a20dff9a48e364eca0a588ac193eb2f5ad83 100644
--- a/aidge_export_cpp/templates/data/aidge_tensor.jinja
+++ b/aidge_export_cpp/templates/data/aidge_tensor.jinja
@@ -1,7 +1,79 @@
 #include <stdint.h>
 
-static const {{ data_t }} {{ name }}
+#define {{name|upper}}_FMT  Format::{{ dformat | upper }}
+
+{%- set format_map = {
+    "int8_t": "%4d",
+    "int16_t": "%6d",
+    "int32_t": "%6d",
+    "int64_t": "%8d",
+    "uint8_t": "%4d",
+    "uint16_t": "%6d",
+    "uint32_t": "%6d",
+    "uint64_t": "%8d",
+    "float": "%.9f",
+    "half_float::half": "%.9f",
+    "double": "%.17f"
+} %}
+
+static const {{ dtype }} {{ name }}
 {%- for dim in dims -%}
     [{{ dim }}]
 {%- endfor %} = 
-{{ values }};
+
+{{ '{' -}}
+
+{# 1D #}
+{%- if dims | length == 1 -%}
+{{ '{' }}
+{%- for x in range(dims[0]) -%}
+{{ format_map[dtype] | format(values[x]) }},
+{%- endfor -%}
+{{ '}' }};
+{%- endif -%}
+
+{#- 2D #}
+{%- if dims | length == 2 -%}
+{%- for y in range(dims[0]) -%}
+{{ '{' }}
+    {%- for x in range(dims[1]) -%}
+        {{ format_map[dtype] | format(values[y][x]) }}, 
+    {%- endfor -%}
+{{ '}' }},
+{%- endfor -%}
+{%- endif -%}
+
+{#- 3D #}
+{%- if dims | length == 3 -%}
+{%- for z in range(dims[0]) %}
+{{ '{' }}
+    {%- for y in range(dims[1]) %}
+    {{ '{' }}
+        {%- for x in range(dims[2]) -%}
+            {{ format_map[dtype] | format(values[z][y][x]) }}, 
+        {%- endfor -%}
+    {{ '}' }},
+    {%- endfor %}
+{{ '}' }},
+{%- endfor -%}
+{%- endif -%}
+
+{#- 4D #}
+{%- if dims | length == 4 -%}
+{%- for n in range(dims[0]) %}
+{{ '{' }}
+    {%- for z in range(dims[1]) %}
+    {{ '{' }}
+        {%- for y in range(dims[2]) %}
+        {{ '{' }}
+            {%- for x in range(dims[3]) -%}
+                {{ format_map[dtype] | format(values[n][z][y][x]) }}, 
+            {%- endfor -%}
+        {{ '}' }},
+        {%- endfor %}
+    {{ '}' }},
+    {%- endfor %}
+{{ '}' }},
+{%- endfor %}
+{%- endif -%}
+{{ '};' }}
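The explicit 1D-4D branches above could equally be written recursively; a compact Python sketch of the same output (a brace-nested C initializer, with per-dtype formatting analogous to format_map):

def c_initializer(values, fmt="%4d"):
    # Recursively emit a brace-nested C array initializer.
    if not isinstance(values, list):
        return fmt % values
    return "{" + ", ".join(c_initializer(v, fmt) for v in values) + "}"

print(c_initializer([[1, 2], [3, 4]]))   # {{   1,    2}, {   3,    4}}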
diff --git a/aidge_export_cpp/templates/data/parameters.jinja b/aidge_export_cpp/templates/data/parameters.jinja
index b58ca9c1977a5e8f9af69401c0eb3e5f47fc6cc4..70e2b7dcc5b5728bcc252c4974d4e09b79f975f0 100644
--- a/aidge_export_cpp/templates/data/parameters.jinja
+++ b/aidge_export_cpp/templates/data/parameters.jinja
@@ -1,21 +1,66 @@
 {#- For libraries -#}
 #include <stdint.h>
 
+{%- set format_map = {
+    "int8_t": "%4d",
+    "int16_t": "%6d",
+    "int32_t": "%6d",
+    "int64_t": "%8d",
+    "uint8_t": "%4d",
+    "uint16_t": "%6d",
+    "uint32_t": "%6d",
+    "uint64_t": "%8d",
+    "float": "%.9f",
+    "half_float::half": "%.9f",
+    "double": "%.17f"
+} %}
+
 {# Design header of the array -#}
-static const {{ data_t }} {{ name }}[{{ values|length }}] __attribute__((section(".nn_data"))) =
+static const {{ dtype }} {{ name }}[{{ dims | join("*") }}] __attribute__((section(".nn_data"))) =
 {
-    {# For loop to add new elements -#}
-    {%- for i in range(values|length) %}
+{# 1D #}
+{%- if dims | length == 1 -%}
+{%- for x in range(dims[0]) -%}
+{{ format_map[dtype] | format(values[x]) }}, 
+{%- endfor -%}
+{%- endif -%}
+
+{#- 2D #}
+{%- if dims | length == 2 -%}
+{%- for y in range(dims[0]) %}
+{{ ' ' }}
+    {%- for x in range(dims[1]) -%}
+        {{ format_map[dtype] | format(values[y][x]) }}, 
+    {%- endfor %}
+{%- endfor -%}
+{%- endif -%}
+
+{#- 3D #}
+{%- if dims | length == 3 -%}
+{%- for z in range(dims[0]) %}
+{{ ' ' }}
+    {%- for y in range(dims[1]) %}
+    {{ ' ' }}
+        {%- for x in range(dims[2]) -%}
+            {{ format_map[dtype] | format(values[z][y][x]) }}, 
+        {%- endfor -%}
+    {%- endfor %}
+{%- endfor -%}
+{%- endif -%}
 
-        {#- Last value -#}
-        {%- if (i+1) == values|length -%}
-        {{ values[i]|string }}
-        {%- else -%}
-            {%- if (i+1) % 5 == 0 -%}
-            {{ values[i]|string + ",\n\t" }}
-            {%- else -%}
-            {{ values[i]|string + ", " }}
-            {%- endif -%}
-        {%- endif -%}
+{#- 4D #}
+{%- if dims | length == 4 -%}
+{%- for n in range(dims[0]) %}
+{{ ' ' }}
+    {%- for z in range(dims[1]) %}
+    {{ ' ' }}
+        {%- for y in range(dims[2]) %}
+        {{ ' ' }}
+            {%- for x in range(dims[3]) -%}
+                {{ format_map[dtype] | format(values[n][z][y][x]) }}, 
+            {%- endfor -%}
+        {%- endfor %}
     {%- endfor %}
+{%- endfor %}
+{%- endif %}
 };
diff --git a/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja b/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja
index 98778c1deff2a8bc2b32eac395f8113279f03d68..b30a03545271812ca5a17934caea2c3fc17c26e6 100644
--- a/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja
@@ -1,8 +1,14 @@
 {%- if aidge_cmp is defined and aidge_cmp %}
-#if AIDGE_CMP
+#ifdef AIDGE_CMP
     aidge_cmp<{{ out_name[0] | upper }}_NB_OUTPUTS, 
               {{ out_name[0] | upper }}_OUT_HEIGHT, 
-              {{ out_name[0] | upper }}_OUT_WIDTH>
+              {{ out_name[0] | upper }}_OUT_WIDTH,
+              {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+              {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+              {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+              {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+              {{ out_name[0] | upper }}_AIDGE_FMT,
+              {{ out_name[0] | upper }}_DEV_FMT>
              ("{{ name }}", ({{out_cdtype[0]}}*) {{ out_name[0] }}_aidge, {{ out_name[0] }});
 #endif
 {%- endif %}
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja b/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja
index f3bea038c52dc1500f6a827a24557f2bc4f7dcc9..e3853184e7bedf0195abe6867eb12439590001d5 100644
--- a/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja
@@ -1,3 +1,3 @@
 {%- for outidx in range(nb_out) %}
-{{out_cdtype[outidx]}}* {{out_name[outidx]}} = ({{out_cdtype[outidx]}}*) mem + {{out_name[outidx]|upper}}_OFFSET;
+{{out_cdtype[outidx]}}* {{out_name[outidx]}} = ({{out_cdtype[outidx]}}*) (mem + {{out_name[outidx]|upper}}_MEM_OFFSET);
 {%- endfor %}
diff --git a/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja b/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja
index 6865be575a613af16fc6a88fd969525abba80d0d..c0956d290f6c0f4b4cebd901cd1d531a69584c89 100644
--- a/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja
@@ -1,19 +1,16 @@
 
-#if SAVE_OUTPUTS
-{% for outidx in range(nb_out) -%}
-    FILE* {{out_name[outidx]|upper}}_STREAM = fopen("data/export_outputs/{{out_name[outidx]}}.txt", "w");
-    saveOutputs<{{out_cdtype[outidx]}}>(
-        {{out_name[outidx]|upper}}_NB_OUTPUTS,
-        {{out_name[outidx]|upper}}_OUT_HEIGHT,
-        {{out_name[outidx]|upper}}_OUT_WIDTH,
-        {#- {{out_name[outidx]|upper}}_CONT_OFFSET,
-        {{out_name[outidx]|upper}}_CONT_SIZE,
-        {{out_name[outidx]|upper}}_WRAP_OFFSET,
-        {{out_name[outidx]|upper}}_WRAP_SIZE,
-        {{out_name[outidx]|upper}}_STRIDE, #}
-        {{out_name[outidx]}},
-        {{out_name[outidx]|upper}}_STREAM,
-        Format::NHWC);
-    fclose({{out_name[outidx]|upper}}_STREAM);
-{% endfor %}
+#ifdef SAVE_OUTPUTS
+printf("[NOTICE] - Saving outputs of node {{ name }}\n");
+FILE* {{ out_name[0] | upper }}_STREAM = fopen("data/export_outputs/{{out_name[0]}}.txt", "w");
+saveOutputs<{{ out_name[0] | upper }}_NB_OUTPUTS,
+            {{ out_name[0] | upper }}_OUT_HEIGHT,
+            {{ out_name[0] | upper }}_OUT_WIDTH,
+            {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+            {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+            {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+            {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+            {{ out_name[0] | upper }}_DEV_FMT>
+            ({{ out_name[0] }},
+            {{ out_name[0] | upper }}_STREAM);
+fclose({{ out_name[0] | upper }}_STREAM);
 #endif
diff --git a/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja b/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja
index 1dc4eb530aeafdac10b59b3b2c8a0313d4411659..b1fd417bc50ae495a537f505a7ea7729778d04a4 100644
--- a/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja
@@ -1,7 +1,17 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-activation_forward<{{name|upper}}_NB_DATA,
-                   {{name|upper}}_ACTIVATION>
+activation_forward<{{name|upper}}_NB_ELTS,
+                   {{name|upper}}_ACTIVATION,
+                   {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+                   {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+                   {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+                   {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+                   {{ in_name[0]|upper }}_MEM_STRIDE,
+                   {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+                   {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+                   {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+                   {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+                   {{ out_name[0]|upper }}_MEM_STRIDE>
                    ({{in_name[0]}}, {{out_name[0]}}, {{name|upper}}_RESCALING);
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
diff --git a/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja b/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja
index 88cbc9a25f6e5342c2d3cc14f8e40fe452716944..aff15f26beae9dc68fae177a47e9546d49804073 100644
--- a/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja
@@ -1,22 +1,12 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-const float* {{ name|upper }}_INPUTS[] = {
-    {%- for i in range(nb_in) -%}
-        {{ in_name[i] }}{{ ", " if not loop.last else "" }}
-    {%- endfor -%}
-};
-
-unsigned int {{ name|upper }}_SIZES[] = {
-    {%- for i in range(nb_in) -%}
-        {{ name|upper }}_INPUT_{{i}}_SIZE{{ ", " if not loop.last else "" }}
-    {%- endfor -%}
-};
-
+const {{ out_cdtype[0] }}* {{ name }}_inputs[] = { {{ in_name|join(", ") }} };
 concat_forward<{{ name|upper }}_AXIS_SIZE_POST,
                {{ name|upper }}_AXIS_SIZE_PRE,
-               {{ nb_in }}, float> 
-              ({{ name|upper }}_INPUTS,
-               {{ name|upper }}_SIZES,
+               {{ name|upper }}_AXIS_SIZE,
+               {{ name|upper }}_TOTAL_AXIS_SIZE,
+               {{ nb_in }}> 
+              ({{ name }}_inputs,
                {{ out_name[0] }});
 
 {%- endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
index bdde325707eeb497a93ba2084c0672bd7f7e5daa..e4a5642f9f10057eac426a01ae2dadab186bf248 100644
--- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
@@ -14,7 +14,17 @@ convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name
                     {{name|upper}}_DILATION_X,
                     {{name|upper}}_KERNEL_HEIGHT,
                     {{name|upper}}_KERNEL_WIDTH,
-                    {{name|upper}}_ACTIVATION>
+                    {{name|upper}}_ACTIVATION,
+                    {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+                    {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+                    {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+                    {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+                    {{ in_name[0]|upper }}_MEM_STRIDE,
+                    {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+                    {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+                    {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+                    {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+                    {{ out_name[0]|upper }}_MEM_STRIDE>
                     ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING);
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
diff --git a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
index 1a99921c185d14f4494c923092e2c36ab684945e..300be27a7e6ddbfacd18bb259303aac201787252 100644
--- a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja
@@ -1,12 +1,29 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-elemwise_forward<{{name|upper}}_NB_ELTS,
-                 {{name|upper}}_ELEM_OP,
-                 {{name|upper}}_ACTIVATION>
-                 ({{out_name[0]}},
-                 {{name|upper}}_RESCALING,
-                 {{in_name[0]}},
-                 {{in_name[1]}});
+elemwise_forward<{{name|upper}}_NB_MAT,
+               {{name|upper}}_ELEM_OP,
+               {{name|upper}}_INPUT1_CONT_SIZE,
+               {{name|upper}}_INPUT2_CONT_SIZE,
+               {{name|upper}}_OUTPUT_CONT_SIZE,
+               {{name|upper}}_OFFSET_IN1,
+               {{name|upper}}_OFFSET_IN2,
+               {{name|upper}}_ACTIVATION,
+               {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+               {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+               {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+               {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+               {{ in_name[0]|upper }}_MEM_STRIDE,
+               {{ in_name[1]|upper }}_MEM_CONT_OFFSET,
+               {{ in_name[1]|upper }}_MEM_CONT_SIZE,
+               {{ in_name[1]|upper }}_MEM_WRAP_OFFSET,
+               {{ in_name[1]|upper }}_MEM_WRAP_SIZE,
+               {{ in_name[1]|upper }}_MEM_STRIDE,
+               {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+               {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+               {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+               {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+               {{ out_name[0]|upper }}_MEM_STRIDE>
+                 ({{out_name[0]}}, {{name|upper}}_RESCALING, {{in_name[0]}}, {{in_name[1]}});
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
 {% endfilter %}
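The rewritten elemwise kernel now takes a matrix count (NB_MAT), per-input contiguous
sizes and per-matrix offsets, which is what lets the broadcast test cases below drop
their expectedFailure markers. One plausible reading of these parameters, as a NumPy
reference (semantics inferred from the names, not from the C++ source):

    import numpy as np

    def elemwise_broadcast_ref(in1, in2, nb_mat, in1_cont, in2_cont, out_cont,
                               off1, off2, op=np.add):
        # off1/off2 are 0 (broadcast a single matrix) or 1 (advance one matrix
        # per iteration); inner indices wrap modulo each input's contiguous size.
        out = np.empty(nb_mat * out_cont, dtype=in1.dtype)
        for mat in range(nb_mat):
            for i in range(out_cont):
                a = in1[mat * off1 * in1_cont + i % in1_cont]
                b = in2[mat * off2 * in2_cont + i % in2_cont]
                out[mat * out_cont + i] = op(a, b)
        return out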
diff --git a/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..9f3fbf3009c96454aceb306042358355e6b60a22
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
@@ -0,0 +1,6 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+erf_forward<{{name|upper}}_NB_ELTS>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja b/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja
index 9a35d799be09d1bc5b311f750e64b38656f723c1..ad4e7479fd2b27fc7c3ba2c5e7b0f37b4bc8262a 100644
--- a/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja
@@ -1,12 +1,22 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-fullyconnected_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
+fullyconnected{{ "_" ~ kernel if kernel != "" else "" }}_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                        {{ in_name[0]|upper }}_IN_HEIGHT,
                        {{ in_name[0]|upper }}_IN_WIDTH,
                        {{ out_name[0]|upper }}_NB_OUTPUTS,
                        {{ out_name[0]|upper }}_OUT_HEIGHT,
                        {{ out_name[0]|upper }}_OUT_WIDTH,
-                       {{name|upper}}_ACTIVATION>
+                       {{name|upper}}_ACTIVATION,
+                       {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+                       {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+                       {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+                       {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+                       {{ in_name[0]|upper }}_MEM_STRIDE,
+                       {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+                       {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+                       {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+                       {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+                       {{ out_name[0]|upper }}_MEM_STRIDE>
                        ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING);
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
diff --git a/aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja b/aidge_export_cpp/templates/kernel_forward/identity_forward.jinja
similarity index 60%
rename from aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja
rename to aidge_export_cpp/templates/kernel_forward/identity_forward.jinja
index 6af8ece3428b563689f8373c0dc7560d867fff1d..bf83d0809c8c5ca1e8796c90d533b7ad856000ed 100644
--- a/aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/identity_forward.jinja
@@ -1,7 +1,7 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-reshape_forward<{{name|upper}}_NB_ELTS>
-                 ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}});
+identity_forward<{{name|upper}}_NB_ELTS>
+                 ({{in_name[0]}}, {{out_name[0]}});
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
 {% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja b/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja
index 89cf2591139ef2719d516c2fba522534dbb806c8..15fdd913325854127fc1051c17a4d9924c15f699 100644
--- a/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja
@@ -1,7 +1,17 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-leakyrelu_forward<{{name|upper}}_NB_DATA>
-                   ({{input_name}}, {{output_name}}, {{name|upper}}_ALPHA);
+leakyrelu_forward<{{name|upper}}_NB_ELTS,
+                  {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+                  {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+                  {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+                  {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+                  {{ in_name[0]|upper }}_MEM_STRIDE,
+                  {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+                  {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+                  {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+                  {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+                  {{ out_name[0]|upper }}_MEM_STRIDE>
+                   ({{in_name[0]}}, {{out_name[0]}}, {{name|upper}}_ALPHA);
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
 {% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
index 090fbac398b207d29a1f4ebf94eca564e032a53b..0333d2e81322cf1ba34c82306e8a4b7a247c83e0 100644
--- a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
@@ -1,8 +1,11 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-matmul_forward<{{name|upper}}_M,
-               {{name|upper}}_K,
+matmul_forward<{{name|upper}}_NB_MAT,
                {{name|upper}}_N,
+               {{name|upper}}_M,
+               {{name|upper}}_K,
+               {{name|upper}}_OFFSET_IN1,
+               {{name|upper}}_OFFSET_IN2,
                {{name|upper}}_ACTIVATION>
                ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}, {{name|upper}}_RESCALING);
 {% include "./_save_outputs.jinja" %}
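matmul_forward similarly gains NB_MAT plus per-input offsets, matching the new batched
and broadcast MatMul tests added below. A NumPy sketch under the assumed convention
that input 1 holds (N, K) matrices and input 2 holds (K, M) matrices:

    import numpy as np

    def matmul_batch_ref(a, b, nb_mat, n, m, k, off1, off2):
        # off1/off2 select whether each input advances per output matrix (1)
        # or is broadcast (0); the dimension roles here are an assumption.
        a = a.reshape(-1, n, k)
        b = b.reshape(-1, k, m)
        return np.stack([a[i * off1] @ b[i * off2] for i in range(nb_mat)])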
diff --git a/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja b/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja
index fb1f2b7e0a1b33602c93b96856533a93eeec9023..42bed98cb99aaec9f72af2ebdc15e0c46b5331f3 100644
--- a/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja
@@ -13,7 +13,17 @@ pooling_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                 {{name|upper}}_KERNEL_HEIGHT,
                 {{name|upper}}_KERNEL_WIDTH,
                 {{name|upper}}_POOLING_TYPE,
-                {{name|upper}}_ACTIVATION>
+                {{name|upper}}_ACTIVATION,
+                {{ in_name[0]|upper }}_MEM_CONT_OFFSET,
+                {{ in_name[0]|upper }}_MEM_CONT_SIZE,
+                {{ in_name[0]|upper }}_MEM_WRAP_OFFSET,
+                {{ in_name[0]|upper }}_MEM_WRAP_SIZE,
+                {{ in_name[0]|upper }}_MEM_STRIDE,
+                {{ out_name[0]|upper }}_MEM_CONT_OFFSET,
+                {{ out_name[0]|upper }}_MEM_CONT_SIZE,
+                {{ out_name[0]|upper }}_MEM_WRAP_OFFSET,
+                {{ out_name[0]|upper }}_MEM_WRAP_SIZE,
+                {{ out_name[0]|upper }}_MEM_STRIDE>
                 ({{in_name[0]}}, {{out_name[0]}});
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
diff --git a/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja b/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..ed75773297b5a5f6c75d99c86ad21b4063d374de
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja
@@ -0,0 +1,15 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+reducemean_forward<{{ in_name[0]|upper }}_NB_DIMS,
+                  {{ in_name[0]|upper }}_NB_ELTS,
+                  {{ out_name[0]|upper }}_NB_ELTS,
+                  {{ name|upper }}_NB_AXES_TO_REDUCES>
+                  ({{ name|upper }}_AXES_TO_REDUCE,
+                    {{ in_name[0]|upper }}_DIMS,
+                    {{ name|upper }}_PREAXIS_STRIDES,
+                    {{ name|upper }}_POSTAXIS_STRIDES,
+                    {{ in_name[0] }},
+                    {{ out_name[0] }});
+{% include "./_save_outputs.jinja" %}
+{% include "./_aidge_cmp.jinja" %}
+{% endfilter %}
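Whatever the exact stride bookkeeping behind PREAXIS_STRIDES/POSTAXIS_STRIDES, the
kernel's observable behaviour should match a plain NumPy mean over the reduced axes
(keep_dims=False, as exercised by the new ReduceMean tests below):

    import numpy as np

    def reducemean_ref(x, axes):
        """Semantic reference for reducemean_forward; an empty axis list is
        assumed to mean "reduce everything" unless noop_with_empty_axes is set."""
        return np.mean(x, axis=tuple(axes) if axes else None)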
diff --git a/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja b/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja
deleted file mode 100644
index ce4ffb869c7f99ac789311a9bc98b926253f968f..0000000000000000000000000000000000000000
--- a/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja
+++ /dev/null
@@ -1,9 +0,0 @@
-{% filter indent(width=4, first=False) %}
-{% include "./_mem_offset.jinja" %}
-rescaling_forward<{{name|upper}}_NB_DATA,
-                   {{name|upper}}_ACTIVATION>
-                   ({{in_name[0]}},
-                   {{out_name[0]}},
-                   {{name|upper}}_RESCALING);
-{% include "./_save_outputs.jinja" %}
-{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja b/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..6ce49f952a6a8411150720987d6f3d00091c61c5
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja
@@ -0,0 +1,6 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+sigmoid_forward<{{name|upper}}_NB_ELTS>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja b/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..41ed8a2a0197d3ff1e60967663678054800fd519
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja
@@ -0,0 +1,16 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+slice_forward<{{in_cdtype[0]}},
+                     {{name|upper}}_NB_DIMS,
+                     {{name|upper}}_NB_ELTS,
+                     {{name|upper}}_NB_AXES,
+                     {{name|upper}}_STARTS,
+                     {{name|upper}}_ENDS,
+                     {{name|upper}}_STEPS,
+                     {{name|upper}}_AXES_MOD,
+                     {{name|upper}}_AXES_DIV>
+                    ({{in_name[0]}},
+                     {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% include "./_aidge_cmp.jinja" %}
+{% endfilter %}
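The AXES_MOD/AXES_DIV parameters suggest the kernel maps each flat output index back
to an input index with precomputed div/mod pairs; semantically the result should match
NumPy/ONNX slicing. A reference sketch (argument order hypothetical):

    import numpy as np

    def slice_ref(x, starts, ends, steps, axes):
        # Negative starts/ends/axes follow the usual NumPy/ONNX conventions.
        index = [slice(None)] * x.ndim
        for axis, start, end, step in zip(axes, starts, ends, steps):
            index[axis] = slice(start, end, step)
        return x[tuple(index)]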
diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja
index 58a1e01cd9dd1db49d210ac16579675f90c5e2d4..cd7a306b62957b4801e2b7c8ac9a2abd95a9cb66 100644
--- a/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja
@@ -1,11 +1,12 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
 transpose_ND_forward<{{in_cdtype[0]}},
-                     {{name|upper}}_NB_DIMS>
-                    ({{in_name[0]}},
-                     {{name|upper}}_DIMS,
-                     {{name|upper}}_PERMUTE,
+                     {{name|upper}}_NB_DIMS,
                      {{name|upper}}_NB_ELTS,
+                     {{name|upper}}_PERMUTE,
+                     {{name|upper}}_DIMS,
+                     {{name|upper}}_OUT_STRIDES>
+                    ({{in_name[0]}},
                      {{out_name[0]}});
 {% include "./_save_outputs.jinja" %}
 {% include "./_aidge_cmp.jinja" %}
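transpose_ND_forward now receives the element count and precomputed OUT_STRIDES rather
than deriving them at run time. The added parameter is assumed to encode the output
linearization that this NumPy one-liner performs implicitly:

    import numpy as np

    def transpose_nd_ref(flat_in, dims, perm):
        # Reshape to the logical dims, permute, and flatten back in C order.
        return np.transpose(flat_in.reshape(dims), perm).ravel()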
diff --git a/aidge_export_cpp/unit_tests/test_examples.py b/aidge_export_cpp/unit_tests/test_examples.py
index d3bb40636eba420b457263873ca00fd6b2d3b6a9..98b5e749a44d2e9e5642b625045ed4790f165890 100644
--- a/aidge_export_cpp/unit_tests/test_examples.py
+++ b/aidge_export_cpp/unit_tests/test_examples.py
@@ -9,20 +9,21 @@ EXAMPLES_DIR = CURRENT_DIR / "../../examples"
 
 # Dictionary of test cases: {id: (script_name, script_args)}
 TEST_CASES = {
-    "lenet-no-args": ("export_LeNet/lenet.py", []),
-    "lenet-int8": ("export_LeNet/lenet.py", ["--dtype=int8"]),
-    "resnet18-no-args": ("export_ResNet18/resnet18.py", ["--mock_db"]),
-    "resnet18-int8": ("export_ResNet18/resnet18.py", ["--mock_db", "--dtype=int8"])
+    "lenet-no-args": ("export_LeNet/lenet.py", [], ["MODEL ACCURACY =  100.0 %", "Prediction out#0: 7 (1)"]),
+    "lenet-int8": ("export_LeNet/lenet.py", ["--dtype=int8"], ["MODEL ACCURACY =  100.0 %", "MODEL ACCURACY =  100.0 %", "QUANTIZED ACCURACY =  100.0 %", "Prediction out#0: 7 (119)"]),
+    "lenet-int8-wrap": ("export_LeNet/lenet.py", ["--dtype=int8", "--mem_wrap"], ["MODEL ACCURACY =  100.0 %", "MODEL ACCURACY =  100.0 %", "QUANTIZED ACCURACY =  100.0 %", "Prediction out#0: 7 (119)"]),
+    "resnet18-no-args": ("export_ResNet18/resnet18.py", ["--mock_db"], []),
+    "resnet18-int8": ("export_ResNet18/resnet18.py", ["--mock_db", "--dtype=int8"], [])
 }
 
 def generate_test_cases():
     """Parse TEST_CASES to provide valid pytest params.
     """
-    for test_id, (script, args) in TEST_CASES.items():
-        yield pytest.param(script, args, id=test_id)
+    for test_id, (script, args, result) in TEST_CASES.items():
+        yield pytest.param(script, args, result, id=test_id)
 
-@pytest.mark.parametrize(("script_name", "script_args"), generate_test_cases())
-def test_example_scripts_run_without_error(script_name, script_args):
+@pytest.mark.parametrize(("script_name", "script_args", "script_result"), generate_test_cases())
+def test_example_scripts_run_without_error(script_name, script_args, script_result):
     """Basic test to verify that examples script run withoput raising an Error.
     This test DO NOT check that the examples are working only that they are not broken.
     """
@@ -34,6 +35,9 @@ def test_example_scripts_run_without_error(script_name, script_args):
     )
     assert result.returncode == 0, f"{script_name} failed with error:\n{result.stderr}\n\nTraceback:\n{result.stdout}"
 
+    for res in script_result:
+        assert res in result.stdout, f"Expected output '{res}' not found in the script output:\n{result.stdout}"
+
 
 def main():
     import sys
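
Each TEST_CASES entry now carries a third element: the list of lines that must appear
verbatim in the example's stdout. Extending coverage is then a one-entry change, e.g.
(hypothetical case, shown only to illustrate the tuple layout):

    TEST_CASES["lenet-int8-dev"] = (
        "export_LeNet/lenet.py",            # script, relative to EXAMPLES_DIR
        ["--dtype=int8", "--dev_mode"],     # CLI arguments
        ["QUANTIZED ACCURACY =  100.0 %"],  # substrings expected in stdout
    )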
diff --git a/aidge_export_cpp/unit_tests/test_export.py b/aidge_export_cpp/unit_tests/test_export.py
index 0a34153cebfa405389dd7ccff20a5d359d7967a4..55f558220753723749a601e40cd625173655a089 100644
--- a/aidge_export_cpp/unit_tests/test_export.py
+++ b/aidge_export_cpp/unit_tests/test_export.py
@@ -12,6 +12,10 @@ import shutil
 from aidge_core.utils import run_command
 
 from aidge_export_cpp import cpp_fuse_to_metaops, set_nodes_names
+import pytest
+from _pytest.unittest import TestCaseFunction
+
+aidge_core.Log.set_console_level(aidge_core.Level.Error)  # Reduce useless logs
 
 def initFiller(model):
     # Initialize parameters (weights and biases)
@@ -22,10 +26,10 @@ def initFiller(model):
             value.set_backend("cpu")
             tuple_out = node.output(0)[0]
             # No conv in current network
-            if tuple_out[0].type() == "Conv" and tuple_out[1] == 1:
+            if tuple_out[0].type() == "Conv2D" and tuple_out[1] == 1:
                 # Conv weight
                 aidge_core.xavier_uniform_filler(value)
-            elif tuple_out[0].type() == "Conv" and tuple_out[1] == 2:
+            elif tuple_out[0].type() == "Conv2D" and tuple_out[1] == 2:
                 # Conv bias
                 aidge_core.constant_filler(value, 0.01)
             elif tuple_out[0].type() == "FC" and tuple_out[1] == 1:
@@ -64,15 +68,40 @@ def _np_init_ones(shape, default_value=0.01, dtype=np.float32):
     return data.reshape(shape).astype(dtype)
 
 
+# Global dictionary to store test reports
+test_reports = {}
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    # Execute all other hooks to obtain the report object
+    outcome = yield
+    rep = outcome.get_result()
+    # Store the report in the global dictionary
+    test_reports[item.nodeid] = rep
+
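Note that pytest only picks up pytest_runtest_makereport hooks from conftest.py files
or installed plugins, not from test modules, so for the commented-out report-based
teardown below to work, the hook would have to move into a conftest.py next to this
file, e.g.:

    import pytest

    test_reports = {}

    @pytest.hookimpl(hookwrapper=True)
    def pytest_runtest_makereport(item, call):
        outcome = yield                        # let pytest build the report first
        test_reports[item.nodeid] = outcome.get_result()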
 class test_operator_export(unittest.TestCase):
 
     def setUp(self):
-        # TODO change seed at each test ?
         RNG_SEED = 1234
         np.random.seed(RNG_SEED)
         aidge_core.random.Generator.set_seed(RNG_SEED)
+
     def tearDown(self):
-        pass
+        result = self._outcome.result
+        test_succeeded = True
+        if isinstance(result, TestCaseFunction):
+            print("Test ran with pytest cannot retrieve if the test was a success.")
+        else:
+            test_succeeded = not any(test is self for test, _ in result.failures + result.errors)
+        if test_succeeded:
+            shutil.rmtree(self.export_folder)
+
+        # rep = test_reports.get(self.id())
+        # if rep is not None and rep.passed:
+        #     # Perform teardown actions only if the test passed
+        #     print(g"Removing {self.export_folder}")
+        #     shutil.rmtree(self.export_folder, ignore_errors=True)
+
 
     def unit_test_export(self, graph_view, op_name, in_dims, random_inputs=True, random_weights=True, default_value=0.01):
         """
@@ -86,7 +115,8 @@ class test_operator_export(unittest.TestCase):
         3- Generate Cpp export with a main that compares the result of the inference with the result obtained at step 2.
         4- Retrieve standard output and use regex to check whether the results are the same
         """
-        graph_view.compile("cpu", aidge_core.dtype.float32, dims=in_dims)
+        graph_view.set_backend("cpu")
+        graph_view.forward_dims(dims=in_dims)
 
         for node in graph_view.get_nodes():
             if node.type() == "Producer":
@@ -95,9 +125,8 @@ class test_operator_export(unittest.TestCase):
 
                 if (random_weights):
                     tensor = aidge_core.Tensor(_np_init(value.dims()))
-
                     node.get_operator().set_output(0, tensor)
-                else:
+                elif default_value is not None:
                     aidge_core.constant_filler(value, default_value)
 
         # Fuse operators to match implemented cpp kernels
@@ -117,9 +146,19 @@ class test_operator_export(unittest.TestCase):
 
         # Note the convention ``<op_name>_test`` is useful for gitignore to avoid pushing generated export by accident.
         export_folder = op_name + "_test"
-
+        self.export_folder = export_folder
         shutil.rmtree(export_folder, ignore_errors=True)
 
+        aidge_core.export_utils.generate_main_compare_cpp(export_folder, graph_view)
+
+        graph_view.set_backend(aidge_export_cpp.ExportLibCpp._name)
+        aidge_core.adapt_to_backend(graph_view)
+        graph_view.forward_dims(dims=in_dims)
+        graph_view.save(export_folder + "/graph")
+
+        scheduler = aidge_core.SequentialScheduler(graph_view)
+        scheduler.generate_scheduling()
+
         # Export the model in C++ standalone
         aidge_core.export_utils.scheduler_export(
                 scheduler,
@@ -129,7 +168,6 @@ class test_operator_export(unittest.TestCase):
                 memory_manager_args={"stats_folder": f"{export_folder}/stats", "wrapping": False }
         )
 
-        aidge_core.export_utils.generate_main_compare_cpp(export_folder, graph_view)
         print("COMPILATION")
 
         try:
@@ -241,7 +279,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "HardmaxAxis0", [[10]])
 
-    @unittest.expectedFailure
     def test_FC_image_in(self):
         """Test exporting a FC operator with a HWC input.
         """
@@ -259,6 +296,14 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "ReLU", [[1, 10]])
 
+    def test_leakyrelu(self):
+        print("LeakyReLU")
+        model = aidge_core.sequential([
+            aidge_core.LeakyReLU(name="leakyrelu0", negative_slope=0.1)
+        ])
+
+        self.unit_test_export(model, "LeakyReLU", [[1, 10]])
+
     def test_add(self):
         print("Add")
         model = aidge_core.sequential([
@@ -286,8 +331,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Add", [[1, 5, 7]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_add_simple_broadcast(self):
         print("AddSimpleBroadcast")
         model = aidge_core.sequential([
@@ -297,8 +340,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "AddSimpleBroadcast", [[1, 7, 5]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_add_double_broadcast(self):
         print("AddDoubleBroadcast")
         model = aidge_core.sequential([
@@ -335,8 +376,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Sub", [[1, 5, 7]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_sub_simple_broadcast(self):
         print("SubSimpleBroadcast")
         model = aidge_core.sequential([
@@ -346,8 +385,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "SubSimpleBroadcast", [[1, 7, 5]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_sub_double_broadcast(self):
         print("SubDoubleBroadcast")
         model = aidge_core.sequential([
@@ -384,8 +421,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Mul", [[1, 5, 7]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_mul_simple_broadcast(self):
         print("MulSimpleBroadcast")
         model = aidge_core.sequential([
@@ -395,8 +430,6 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "MulSimpleBroadcast", [[1, 7, 5]])
 
-    # "Broadcast not supported yet in export operator"
-    @unittest.expectedFailure
     def test_mul_double_broadcast(self):
         print("MulDoubleBroadcast")
         model = aidge_core.sequential([
@@ -415,6 +448,54 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "MulBatch", [[3, 5, 7]])
 
+    def test_div(self):
+        print("Div")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 5, 5], name="producer"),
+            aidge_core.Div(name="div")
+        ])
+
+        self.unit_test_export(model, "Div", [[1, 5, 5]])
+
+
+    def test_div_higher(self):
+        print("Div")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 5, 7], name="producer"),
+            aidge_core.Div(name="div")
+        ])
+
+        self.unit_test_export(model, "Div", [[1, 5, 7]])
+
+
+    def test_div_simple_broadcast(self):
+        print("DivSimpleBroadcast")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 1, 5], name="producer"),
+            aidge_core.Div(name="div")
+        ])
+
+        self.unit_test_export(model, "DivSimpleBroadcast", [[1, 7, 5]])
+
+
+    def test_div_double_broadcast(self):
+        print("DivDoubleBroadcast")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 1, 7], name="producer"),
+            aidge_core.Div(name="div")
+        ])
+
+        self.unit_test_export(model, "DivDoubleBroadcast", [[1, 5, 1]])
+
+    def test_div_batch(self):
+        print("DivBatch")
+        model = aidge_core.sequential([
+            aidge_core.Producer([3, 5, 7], name="producer"),
+            aidge_core.Div(name="div")
+        ])
+
+        self.unit_test_export(model, "DivBatch", [[3, 5, 7]])
+
     def test_concat(self):
         print("Concat")
         model = aidge_core.sequential([
@@ -432,6 +513,22 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Transpose", [[1, 7, 8, 2]])
 
+    def test_identity(self):
+        print("Identity")
+        model = aidge_core.sequential([
+            aidge_core.Identity(name="identity")
+        ])
+
+        self.unit_test_export(model, "Identity", [[1, 5, 6, 8]])
+
+    def test_slice(self):
+        print("Slice")
+        model = aidge_core.sequential([
+            aidge_core.Slice([1,3,5,2], [5,6,-1,3], [-4,2,1,3], [1,2,1,1], name="slice")
+        ])
+
+        self.unit_test_export(model, "Slice", [[5, 7, 12, 3]])
+
     def test_reshape(self):
         print("Reshape")
         model = aidge_core.sequential([
@@ -441,7 +538,7 @@ class test_operator_export(unittest.TestCase):
         shape.add_child(model.get_node("reshape"), 0, 1)
         model.add(shape)
 
-        self.unit_test_export(model, "Reshape", [[1, 7, 8, 2]], random_weights=False)
+        self.unit_test_export(model, "Reshape", [[1, 7, 8, 2]], random_weights=False, default_value=None)
 
     def test_matmul(self):
         print("MatMul")
@@ -452,6 +549,51 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "MatMul", [[8, 4]])
 
+    def test_matmul_larger(self):
+        print("MatmulLarger")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 5, 7], name="producer"),
+            aidge_core.MatMul(name="MatMul")
+        ])
+
+        self.unit_test_export(model, "MatMul", [[1, 7, 5]])
+
+    def test_matmul_higher(self):
+        print("MatMulHigher")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 5, 7], name="producer"),
+            aidge_core.MatMul(name="matmul")
+        ])
+
+        self.unit_test_export(model, "MatMul", [[1, 7, 1]])
+
+    def test_matmul_simple_broadcast(self):
+        print("MatMulSimpleBroadcast")
+        model = aidge_core.sequential([
+            aidge_core.Producer([1, 1, 5], name="producer"),
+            aidge_core.MatMul(name="MatMul")
+        ])
+
+        self.unit_test_export(model, "MatMulSimpleBroadcast", [[1, 5, 7]])
+
+    def test_matmul_double_broadcast(self):
+        print("MatMulDoubleBroadcast")
+        model = aidge_core.sequential([
+            aidge_core.Producer([3, 1, 5], name="producer"),
+            aidge_core.MatMul(name="MatMul")
+        ])
+
+        self.unit_test_export(model, "MatMulDoubleBroadcast", [[3, 5, 5]])
+
+    def test_matmul_batch(self):
+        print("MatMulBatch")
+        model = aidge_core.sequential([
+            aidge_core.Producer([3, 5, 7], name="producer"),
+            aidge_core.MatMul(name="matmul")
+        ])
+
+        self.unit_test_export(model, "MatMulBatch", [[3, 7, 7]])
+
     def test_concat_axis_2(self):
         print("ConcatAxis2")
         model = aidge_core.sequential([
@@ -476,7 +618,47 @@ class test_operator_export(unittest.TestCase):
             aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(3, 3), name="conv")
         ])
 
-        self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False)
+        self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]])
+
+    def test_conv2D_asym(self):
+        print("Conv2D_asym")
+        model = aidge_core.sequential([
+            aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(2, 5), name="conv")
+        ])
+
+        self.unit_test_export(model, "Conv2D_asym", [[1, 3, 22, 22]])
+
+    def test_conv2D_asym2(self):
+        print("Conv2D_asym2")
+        model = aidge_core.sequential([
+            aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(2, 5), name="conv")
+        ])
+
+        self.unit_test_export(model, "Conv2D_asym2", [[1, 3, 59, 22]])
+
+    def test_conv1D(self):
+        print("Conv1D")
+        model = aidge_core.sequential([
+            aidge_core.Conv1D(in_channels=3, out_channels=3, kernel_dims=[3],name="conv1d")
+        ])
+
+        self.unit_test_export(model, "Conv1D", [[1, 3, 12]])
+
+    def test_conv1D_stride(self):
+        print("Conv1D_stride")
+        model = aidge_core.sequential([
+            aidge_core.Conv1D(in_channels=3, out_channels=3, kernel_dims=[3], stride_dims=[3],name="conv1d")
+        ])
+
+        self.unit_test_export(model, "Conv1D_stride", [[1, 3, 24]])
+
+    def test_conv1D_dilation(self):
+        print("Conv1D_dilation")
+        model = aidge_core.sequential([
+            aidge_core.Conv1D(in_channels=1, out_channels=8, kernel_dims=[3], stride_dims=[3], dilation_dims=[5],name="conv1d")
+        ])
+
+        self.unit_test_export(model, "Conv1D_dilation", [[1, 1, 107]])
 
     def test_convDepthWise2D(self):
         print("ConvDepthWise2D")
@@ -484,15 +666,16 @@ class test_operator_export(unittest.TestCase):
             aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv")
         ])
 
-        self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]], False, False)
+        self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]])
 
     def test_max_pooling(self):
         print("MaxPooling2D")
         model = aidge_core.sequential([
             aidge_core.MaxPooling2D(kernel_dims=(3, 3), name="max_pool")
         ])
+        model.set_ordered_outputs([(model.get_node("max_pool"), 0)], True)
 
-        self.unit_test_export(model, "MaxPooling2D", [[1, 2, 12, 12]], False, False)
+        self.unit_test_export(model, "MaxPooling2D", [[1, 2, 12, 12]])
 
     def test_avg_pooling(self):
         print("AvgPooling2D")
@@ -500,7 +683,7 @@ class test_operator_export(unittest.TestCase):
             aidge_core.AvgPooling2D(kernel_dims=(3, 3), name="avg_pool")
         ])
 
-        self.unit_test_export(model, "AvgPooling2D", [[1, 2, 12, 12]], False, False)
+        self.unit_test_export(model, "AvgPooling2D", [[1, 2, 12, 12]])
 
     def test_pad2D(self):
         print("Pad2D")
@@ -559,6 +742,46 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Pad2DNotConstant", [[1, 5, 7, 11]])
 
+    def test_reducemean_2_axes_minimal(self):
+        print("ReduceMean2Axes_minimal")
+        model = aidge_core.sequential([
+            aidge_core.ReduceMean([1,2], keep_dims=False, name="ReduceMean")
+        ])
+
+        self.unit_test_export(model, "ReduceMean2Axes_minimal", [[1,2,3]])
+
+    def test_reducemean_1_axis(self):
+        print("ReduceMean1Axis")
+        model = aidge_core.sequential([
+            aidge_core.ReduceMean([2], keep_dims=False, name="ReduceMean")
+        ])
+
+        self.unit_test_export(model, "ReduceMean1Axis", [[3,3,2]])
+
+    def test_reducemean_all_axes(self):
+        print("ReduceMean")
+        model = aidge_core.sequential([
+            aidge_core.ReduceMean([], keep_dims=False, name="ReduceMean")
+        ])
+
+        self.unit_test_export(model, "ReduceMean0Axes", [[3,3,2]])
+
+    def test_reducemean_no_axes(self):
+        print("ReduceMean")
+        model = aidge_core.sequential([
+            aidge_core.ReduceMean([], keep_dims=False, noop_with_empty_axes = True, name="ReduceMean")
+        ])
+
+        self.unit_test_export(model, "ReduceMean0Axes", [[3,3,2]])
+
+    def test_reducemean_several_axes(self):
+        print("ReduceMeanSeveralAxes")
+        model = aidge_core.sequential([
+            aidge_core.ReduceMean([1, 3], keep_dims=False, name="ReduceMean")
+        ])
+
+        self.unit_test_export(model, "ReduceMeanSeveralAxes", [[1, 5, 7, 11]])
+
     def test_batchnorm2D(self):
         print("BatchNormalization2D")
         model = aidge_core.sequential([
@@ -624,16 +847,101 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "BatchNorm2DDenser", [[1, 3, 5, 7]], False, False)
 
-
-    def test_cpp(self):
-        print("Export test to do")
-
     def test_Conv(self):
+        print("Conv")
         model = aidge_core.sequential([
             aidge_core.Conv2D(1, 1, [3, 3], name="InputNode")
         ])
         initFiller(model)
         self.unit_test_export(model, "Conv", [[1, 1, 9, 9]])
 
+    def test_PaddedConv(self):
+        print("PaddedConv")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConv", [[1, 3, 19, 19]])
+
+    def test_PaddedConv_asym(self):
+        print("PaddedConv_asym")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConv2D(3, 7, kernel_dims=[3, 7], name="InputNode", stride_dims=[2, 1], padding_dims=[3, 1, 2, 3])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConv_asym", [[1, 3, 19, 19]])
+
+    def test_PaddedConv_dilated(self):
+        print("PaddedConv_dilated")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3], dilation_dims=[2, 2])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConv_dilated", [[1, 3, 19, 19]])
+
+    def test_PaddedConv_dilated_asym(self):
+        print("PaddedConv_dilated_asym")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 5], name="InputNode", stride_dims=[1, 2], padding_dims=[1, 3, 3, 0], dilation_dims=[1, 2])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConv_dilated_asym", [[1, 3, 19, 19]])
+
+    def test_PaddedConvDW(self):
+        print("PaddedConvDW")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConvDW", [[1, 3, 19, 19]])
+
+    def test_PaddedConvDW_asym(self):
+        print("PaddedConvDW_asym")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[3, 7], name="InputNode", stride_dims=[2, 1], padding_dims=[3, 1, 2, 3])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConvDW_asym", [[1, 3, 19, 19]])
+
+    def test_PaddedConvDW_dilated(self):
+        print("PaddedConvDW_dilated")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3], dilation_dims=[2, 2])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConvDW_dilated", [[1, 3, 19, 19]])
+
+    def test_PaddedConvDW_dilated_asym(self):
+        print("PaddedConvDW_dilated_asym")
+        model = aidge_core.sequential([
+            aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 5], name="InputNode", stride_dims=[1, 2], padding_dims=[1, 3, 3, 0], dilation_dims=[1, 2])
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "PaddedConvDW_dilated_asym", [[1, 3, 19, 19]])
+
+    def test_Conv2(self):
+        print("Conv2")
+        model = aidge_core.sequential([
+            aidge_core.Conv2D(2, 2, [3, 3], name="InputNode")
+        ])
+        initFiller(model)
+        self.unit_test_export(model, "Conv2", [[1, 2, 9, 9]], random_inputs=True, random_weights=False, default_value=None)
+
+    def test_erf(self):
+        print("Erf")
+        model = aidge_core.sequential([
+            aidge_core.Erf(name="erf")
+        ])
+
+        self.unit_test_export(model, "Erf", [[1, 10]])
+
+    def test_sigmoid(self):
+        print("Sigmoid")
+        model = aidge_core.sequential([
+            aidge_core.Sigmoid(name="sigmoid")
+        ])
+
+        self.unit_test_export(model, "Sigmoid", [[1, 100]])
+
 if __name__ == '__main__':
     unittest.main()
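
Condensed, the export path each test now follows is: infer shapes on the generic CPU
backend, switch to the export backend, let Aidge adapt layouts, then reschedule before
generating the export. A standalone sketch of that flow (the argument order of
scheduler_export and the memory_manager helper are assumptions based on the calls
above):

    import aidge_core
    import aidge_export_cpp

    def export_for_test(graph_view, in_dims, export_folder):
        graph_view.set_backend("cpu")
        graph_view.forward_dims(dims=in_dims)      # shape inference on CPU first
        graph_view.set_backend(aidge_export_cpp.ExportLibCpp._name)
        aidge_core.adapt_to_backend(graph_view)    # insert layout adapters
        graph_view.forward_dims(dims=in_dims)
        scheduler = aidge_core.SequentialScheduler(graph_view)
        scheduler.generate_scheduling()            # schedule the adapted graph
        aidge_core.export_utils.scheduler_export(
            scheduler,
            export_folder,
            aidge_export_cpp.ExportLibCpp,
            memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
            memory_manager_args={"stats_folder": f"{export_folder}/stats",
                                 "wrapping": False})
        return scheduler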
diff --git a/examples/export_LeNet/.gitignore b/examples/export_LeNet/.gitignore
index 98ce649a943a90590bb8f4f067a3c1ac9691dcbc..faa9a65c0125f8177ff3f772833f8ffd5f88d886 100644
--- a/examples/export_LeNet/.gitignore
+++ b/examples/export_LeNet/.gitignore
@@ -1,6 +1,8 @@
 # Exclude export artefacts
-export_lenet_int8/
+export_lenet_*
 log_outputs/*
+graph_log/
 assets/*
 data/*
 log.txt
+*.json
diff --git a/examples/export_LeNet/lenet.py b/examples/export_LeNet/lenet.py
index 1cda87b9583a37007b61ef481d49c2bed037020f..8dccad9a5ce9387871ae48585e76a6ac6986546b 100644
--- a/examples/export_LeNet/lenet.py
+++ b/examples/export_LeNet/lenet.py
@@ -4,32 +4,20 @@ lenet.py
 Run this file to export a LeNet using the Aidge CPP Export module.
 """
 
-import os
-import shutil
 import random
-import aidge_core.utils
-import numpy as np
 import subprocess
+import numpy as np
 
 # Aidge Modules
 import aidge_core
 import aidge_onnx
-import aidge_backend_cpu
-import aidge_quantization
 import aidge_export_cpp
+import aidge_backend_cpu
+import aidge_quantization
 
-from aidge_export_cpp.export_utils import (
-    cpp_fuse_to_metaops,
-    set_nodes_names,
-    set_nodes_datatypes,
-    exclude_unwanted_producers)
-
-from aidge_core.export_utils import remove_optional_inputs, get_node_from_metaop
+from aidge_export_cpp.export_utils import *
 
 # Torch (Dataset)
-import torch
-import torch.nn.functional as F
-from torch import nn
 from torchvision import transforms, datasets
 
 # Arguments
@@ -38,7 +26,8 @@ import argparse
 supported_types = ["float32", "int8"]
 
 parser = argparse.ArgumentParser(description="Export the LeNet model with the aidge_export_cpp module.")
-parser.add_argument("--dev", action="store_true", help="Export in dev mode")
+parser.add_argument("--dev_mode", action="store_true", help="Export in dev mode")
+parser.add_argument("--mem_wrap", action="store_true", help="Use memory wrapping")
 parser.add_argument("--no_cuda", action="store_true", help="Disable USE_CUDA usage to perform inferences and training.")
 parser.add_argument("--dtype", type=str, choices=supported_types, default="float32", help="Specify the targeted datatype : [int8, float32]")
 parser.add_argument("--aidge_cmp", action="store_true", help="Use aidge tensor results as reference.")
@@ -59,8 +48,14 @@ parser.add_argument(
 )
 args = parser.parse_args()
 
+# CUDA Usage
 USE_CUDA = not args.no_cuda
 
+if USE_CUDA:
+    import aidge_backend_cuda
+
+backend = "cuda" if USE_CUDA else "cpu"
+
 # Setting Aidge verbose level
 if args.verbose == 0:
     aidge_core.Log.set_console_level(aidge_core.Level.Error)
@@ -71,236 +66,97 @@ elif args.verbose == 2:
 elif args.verbose >= 3:
     aidge_core.Log.set_console_level(aidge_core.Level.Debug)
 
-if USE_CUDA:
-    import aidge_backend_cuda
-
 # ------------------------------------------------------------
-# EXPORT CONFIG
+# DOWNLOAD & LOAD THE MODEL
 # ------------------------------------------------------------
 
-"""
-Export configuration details :
-- RNG_SEED :        Fix a random seed for torch to always get the same images from the dataset,
-                        therefore always getting the same output.
-- NB_TEST :         Number of example inferences to perform (used to get an accuracy approximation).
-- NB_CALIB :        Number of samples used for the calibration step of quantization.
-- MODEL_NAME :      Should be the same name as the onnx file you want to load and export.
-- DO_EXAMPLES :     Perform example inferences (and allow to get accuracy approximation)
-- NB_BITS :         Quantization output precision. Should be 8 to work with this export.
-- TARGET_TYPE :     The aidge datatype for tensors to be casted after the quantization step [float64, float32, int32].
-- OPTIM_SIGN :      Quantization optional optimization based on data sign.
-- SINGLE_SHIFT :    Quantization option specifying if inserted scaling nodes should be
-                        single shift or floating point.
-- NO_QUANT :        Skip the quantization step.
-- CLIPPING :        Clipping method during quantization.
-- FOLD_GRAPH :      The quantization step adds cast nodes to cast the graph into the given TARGET_TYPE.
-                        Enabling the FOLD_GRAPH will automatically fold these nodes into the following
-                        ones at the end of quantization step.
-- USE_CUDA :        Determine if the quantization step uses the GPU. It is generally recommended
-                        to enable this option if you have access to GPUs as the quantization step
-                        may take a while to complete.
-- DEV_MODE :        The dev mode allows to identify errors more easily exporting the model with
-                        symbolic links enabling to modify the source files directly in the
-                        generated export (make sure you installed the export plugin running
-                        `pip install -e .`).
-                        Enabled running this python file, adding the --dev argument.
-- AIDGE_CMP :       Saves and export the outputs generated by the aidge inferences in order
-                        to compare it with the export outputs.
-                        Enabled running this python file, adding the --aidge_cmp argument.
-"""
+# Download the model
+file_url = "https://huggingface.co/EclipseAidge/LeNet/resolve/main/lenet_mnist.onnx?download=true"
+file_path = "lenet_mnist.onnx"
+aidge_core.utils.download_file(file_path, file_url)
 
-print(" Available backends : ", aidge_core.Tensor.get_available_backends())
-
-quantize_model = False
-NB_BITS = 32
-TARGET_TYPE = aidge_core.dtype.float32
-
-if args.dtype == "float32":
-    quantize_model = False
-elif args.dtype == "int8":
-    quantize_model = True
-    NB_BITS = 8
-    TARGET_TYPE = aidge_core.dtype.int32    # int8 not yet available
-else:
-    print(f"[ERROR] Datatype '{args.dtype}' not supported.")
-    print(f"[ERROR] Supported datatypes : {supported_types}.")
-    exit(1)
-
-RNG_SEED      = 1234
-NB_TEST       = 10 # Example inferences
-NB_CALIB      = 20 # Calibration set
-MODEL_NAME    = 'lenet'
-EXPORT_FOLDER   = f"export_{MODEL_NAME}_int8"
-DO_EXAMPLES   = True
-
-# Quantization params
-OPTIM_SIGN      = False
-SINGLE_SHIFT    = True
-ROUNDING        = True
-NO_QUANT = False
-CLIPPING        = aidge_quantization.Clipping.MSE  # 'MAX'
-FOLD_GRAPH      = True
-
-# Export modes
-DEV_MODE      = args.dev
-AIDGE_CMP     = args.aidge_cmp
-
-print('\n RNG_SEED         = ', RNG_SEED)
-print(' MODEL_NAME       = ', MODEL_NAME)
-print(' NB_TEST          = ', NB_TEST)
-print(' NB_CALIB         = ', NB_CALIB)
-print(' NB_BITS          = ', NB_BITS)
-print(' OPTIM_SIGN       = ', OPTIM_SIGN)
-print(' NO_QUANT         = ', NO_QUANT)
-print(' CLIPPING         = ', CLIPPING)
-print(' SINGLE_SHIFT     = ', SINGLE_SHIFT)
-print(' USE_CUDA         = ', USE_CUDA)
-print(' DEV_MODE         = ', DEV_MODE)
-
-torch.manual_seed(RNG_SEED)
-random.seed(RNG_SEED)
+# Load the model
+model = aidge_onnx.load_onnx(file_path, verbose=False)
 
-backend = "cuda" if USE_CUDA else "cpu"
+# --------------------------------------------------------------
+# CLEAN THE MODEL
+# --------------------------------------------------------------
 
-# ------------------------------------------------------------
-# CREATE THE LENET MODEL
-# ------------------------------------------------------------
-"""
-The LeNet model is created and trained using the create_lenet file.
-If a lenet.onnx file is already present in the current folder, this step will be skiped.
-The generated network is not yet quantized.
-"""
-# Define the target path and filename
-file_url = "https://huggingface.co/EclipseAidge/LeNet/resolve/main/lenet_mnist.onnx?download=true"
-file_path = MODEL_NAME + "_mnist.onnx"
-aidge_core.utils.download_file(file_path, file_url)
+aidge_core.remove_flatten(model)
+aidge_core.fuse_batchnorm(model)
+aidge_core.expand_metaops(model, name_format="{0}_{1}_{2}")
 
 # --------------------------------------------------------------
-# CREATE THE SAMPLES
+# TEST THE MODEL
 # --------------------------------------------------------------
 
+# Create the samples
+
+NB_TEST       = 10 # Number of example inferences (increase, e.g. to 100, for a better accuracy estimate)
+NB_CALIB      = 20 # Number of samples used for quantization calibration
+
 transform = transforms.ToTensor()
 test_set  = datasets.MNIST(root='./data', train=False, transform=transform, download=True)
 
 tensors = []
 labels  = []
-index = 0
-for in_tensor, label in test_set:
-    array = np.array(in_tensor)
-    array = np.reshape(array, (1, 1, 28, 28))
-    tensor = aidge_core.Tensor(array)
-    tensor.set_backend(backend)
-    tensor.set_datatype(aidge_core.dtype.float32)
+for i, (tensor, label) in enumerate(test_set):
+    tensor = np.reshape(tensor.numpy(), (1, 1, 28, 28))
+    tensor = aidge_core.Tensor(tensor)
+    tensor.set_backend("cpu")
     tensors.append(tensor)
     labels.append(label)
-    index += 1
-    if (index == max(NB_TEST, NB_CALIB)):
+    if i + 1 >= max(NB_TEST, NB_CALIB):
         break
 
-# --------------------------------------------------------------
-# LOAD THE MODEL
-# --------------------------------------------------------------
-
-"""
-Load the .onnx model and perform some usual graph modifications :
-    - Remove the flatten nodes;
-    - Fuse the batchnorm nodes into the biases producers.
-    - Expand the metaOperators to perform the desired fusions.
-"""
-
-model = aidge_onnx.load_onnx(file_path, verbose=False)
-aidge_core.remove_flatten(model)
-aidge_core.fuse_batchnorm(model)
-aidge_core.expand_metaops(model)
-model.save("imported_model")
-
-# --------------------------------------------------------------
-# SET UP THE AIDGE SCHEDULER
-# --------------------------------------------------------------
-
-"""
-The scheduler is an ordered version of the model, allowing to schedule
-nodes to be able to run inferences, for instance.
-"""
-
 # Set up the backend
 model.set_datatype(aidge_core.dtype.float32)
 model.set_backend(backend)
 
-# Create the Scheduler
+# Create the Scheduler
 scheduler = aidge_core.SequentialScheduler(model)
 
-# --------------------------------------------------------------
-# RUN SOME EXAMPLES INFERENCES
-# --------------------------------------------------------------
-
-def propagate(model, scheduler, tensor):
-    """
-    Propagate the given tensor into the model and return the
-    output tensor.
-    """
-    print(f"Propagate: {tensor.backend()}")
-    # Run the inference
-    scheduler.forward(True, [tensor])
-    # Gather the results
-    output_node = model.get_output_nodes().pop()
-    output_tensor = output_node.get_operator().get_output(0).clone()
-    output_tensor.set_backend("cpu")
-    return np.array(output_tensor)
-
+# Run some examples
 accuracy = 0
-if (DO_EXAMPLES):
-    print('\n EXAMPLE INFERENCES :')
-    nb_valid = 0
-    base_values = []
-    for i in range(NB_TEST):
-        print(f"Inférence: {tensors[i].backend()}")
-        output_array = propagate(model, scheduler, tensors[i])
-        print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
-        base_values.append(np.max(output_array))
-        if (labels[i] == np.argmax(output_array)):
-            nb_valid += 1
-    accuracy = nb_valid / NB_TEST
-    print('\n MODEL ACCURACY = ', accuracy * 100, '%')
+print('\n EXAMPLE INFERENCES :')
+nb_valid = 0
+base_values = []
+for i in range(NB_TEST):
+    output_array = propagate(model, scheduler, tensors[i])
+    print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
+    base_values.append(np.max(output_array))
+    if (labels[i] == np.argmax(output_array)):
+        nb_valid += 1
+accuracy = nb_valid / NB_TEST
+print('\n MODEL ACCURACY = ', accuracy * 100, '%')
 
 # --------------------------------------------------------------
 # PERFORM THE QUANTIZATION
 # --------------------------------------------------------------
 
-if quantize_model:
+if args.dtype != "float32":
+
+    TARGET_TYPE = aidge_core.dtype.int32
+    if args.dtype == "int8":
+        NB_BITS = 8
+    else:
+        aidge_core.Log.fatal(f"Type {args.dtype} not yet supported. \
+                             \nSupported types : {supported_types}.")
+
     aidge_quantization.quantize_network(
         network = model,
         nb_bits = NB_BITS,
         calibration_set = tensors[0:NB_CALIB],
-        clipping_mode = CLIPPING,
         target_type = TARGET_TYPE,
-        no_quant = NO_QUANT,
-        optimize_signs = OPTIM_SIGN,
-        single_shift = SINGLE_SHIFT,
-        use_cuda = USE_CUDA,
-        fold_graph = FOLD_GRAPH)
-
-# Tag the scaling producers
-for node in model.get_nodes():
-    if node.type() == "Quantizer":
-        for SNode in get_node_from_metaop(node, "BitShift"):
-            SNode.get_parent(1).attributes().shift_prod = True
-        for CNode in get_node_from_metaop(node, "Mul"):
-            CNode.get_parent(1).attributes().coef_prod = True
-
-model.save("post_ptq_model")
-
-# --------------------------------------------------------------
-# RESCALE THE INPUT SAMPLES
-# --------------------------------------------------------------
+        single_shift = True,
+        clipping_mode = aidge_quantization.Clipping.MSE,
+        use_cuda = USE_CUDA)
 
-"""
-Once the quantization is done, the graph now only accepts integer inputs.
-So we need to rescale the dataset for the data to be within [0, 255].
-Also, tensors should be casted to be the same type as TARGET_TYPE.
-"""
+    # --------------------------------------------------------------
+    # TEST THE QUANTIZED MODEL
+    # --------------------------------------------------------------
 
-if quantize_model:
+    # Quantize the samples
     rescaling = 2**(NB_BITS-1)-1
     for i in range(NB_TEST):
         tensors[i].set_backend("cpu")
@@ -308,39 +164,20 @@ if quantize_model:
         array = np.round(array).astype(int)
         tensors[i] = aidge_core.Tensor(array)
         tensors[i].set_datatype(TARGET_TYPE)
-        tensors[i].set_backend("cpu")
-    # Setting modele to CPU for export
-    model.set_backend("cpu")
-
-
-# --------------------------------------------------------------
-# GENERATE NEW SCHEDULER
-# --------------------------------------------------------------
 
-"""
-Each time the graph has been change, it has to be reset.
-Here some Quantizer and Cast nodes have been added.
-"""
-
-""" [Issue]
-We need first to manually add an input tensor with the correct datatype,
-as it is not automatically done in PTQ.
-"""
-if quantize_model:
-    input_node = model.get_ordered_inputs()[0]
-    input_node[0].get_operator().set_input(0, tensors[0])
+    # Set the backend back to CPU
+    if USE_CUDA:
+        model.set_backend("cpu")
+
+    # Reset the scheduler as the graph may have changed
     scheduler.reset_scheduling()
+    scheduler.generate_scheduling()
 
-# --------------------------------------------------------------
-# PERFORM THE EXAMPLE INFERENCES AGAIN
-# --------------------------------------------------------------
-
-if (DO_EXAMPLES and quantize_model):
-    print('\n QUANTIZED EXAMPLE INFERENCES:')
+    # Perform example inferences on quantized model
+    print('\n QUANTIZED EXAMPLE INFERENCES :')
     nb_valid = 0
     post_values = []
     for i in range(NB_TEST):
-        print(f"QEI: {tensors[i].backend()}")
         output_array = propagate(model, scheduler, tensors[i])
         print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
         post_values.append(np.max(output_array))
@@ -351,154 +188,38 @@ if (DO_EXAMPLES and quantize_model):
     print('\n MODEL ACCURACY = ', accuracy * 100, '%')
     print('\n QUANTIZED ACCURACY = ', quant_accuracy * 100, '%')
 
+    output_array = propagate(model, scheduler, tensors[0])
 
 # --------------------------------------------------------------
-# FUSE NODES INTO METAOPS
-# --------------------------------------------------------------
-
-"""
-Here is made the link between the Aidge model and the CPP
-kernels implementation. In aidge, all the nodes calculations
-are performed separately (Pad -> Conv -> Quantizer -> ReLU -> ...).
-
-However within the CPP export, some core operators are merged
-in meta operators. For instance, the padding, scaling and ReLU are
-performed within the Conv kernel.
-
-In this step, we use graph regex techniques to find the desired patterns
-within the graph in order to match the export implementation of the kernels.
-"""
-
-# Exclude unwanted producers
-"""
-Before fusing the nodes, we set a tag on the Producers in order to exclude
-from the export the ones holding coefficients, as they are directly handled
-within the layers parameters.
-"""
-exclude_unwanted_producers(model)
-
-# Fuse nodes
-cpp_fuse_to_metaops(model)
-
-# Remove optional inputs
-"""
-Some optional inputs may be added by the quantization step (for instance with the clipping nodes).
-Here we make sure that they will not be considered as actual graph inputs by the export, by
-excluding them from the ordered_inputs list of the model.
-"""
-remove_optional_inputs(model)
-
-# Reset scheduler to apply graph modifications
-"""
-The scheduler always needs to be reset after graph manipulation.
-"""
-scheduler.reset_scheduling()
-
-# Name newly created MetaOps
-"""
-As names are optional in Aidge, the fuse_to_metaops function will not automatically
-give a name to the newly created metaOps. However, in an export context, we need
-our operators to be named, as this will be used to name the corresponding files.
-"""
-
-scheduler.generate_scheduling() # Scheduler needs to be generated as it has just been reset
-set_nodes_names(scheduler)
-
-# --------------------------------------------------------------
-# LOG OUTPUTS FOR THE FIRST IMAGE OF THE TEST DATASET
-# --------------------------------------------------------------
-
-"""
-Here a final inference is made on the input we want to export and run.
-This will ensure that all the feature maps tensors (between the layers)
-hold the data corresponding to this specific input.
-Then, the "log_outputs()" function (called later) will store these tensors
-into log files that may be exported as well for comparison purpose.
-"""
-
-output_array = propagate(model, scheduler, tensors[0])
-
-print("### Exported Sample ###")
-print("Aidge prediction :", np.argmax(output_array), "(" + str(np.max(output_array)) + ")")
-print("Label :", labels[0])
-
-# --------------------------------------------------------------
-# HANDLING DATATYPE
-# --------------------------------------------------------------
-
-"""
-Now, despite the quantization stage, all the tensors of the model are
-still "virtually" in Int32. Before exporting the model, we have to set
-tensors' datatypes to Int8, except for biases which should remain in Int32.
-"""
-
-if quantize_model:
-    set_nodes_datatypes(model)
-
-# Store tensors values into log files
-"""
-Once the tensors have been casted, the log_outputs() function can be
-called to store their values into log files.
-"""
-
-if os.path.isdir("log_outputs"):
-    shutil.rmtree("log_outputs")
-model.log_outputs("log_outputs")
-
-# --------------------------------------------------------------
-# TEST MODE
-# --------------------------------------------------------------
-
-"""
-The test mode is mainly used for validation and benchmark. The model will be
-exported in a way that each layer's result will be compared with the CPU implementation.
-The timings for each layer will be displayed.
-In case of error, you will be able to enter debug mode, showing in-layer data or
-changing the inputs of the layer, to isolate the source of the issue.
-"""
-
-for node in model.get_nodes():
-    node.attributes().dev_mode = DEV_MODE
-
-# --------------------------------------------------------------
-# AIDGE CMP
+# EXPORT THE MODEL
 # --------------------------------------------------------------
 
-"""
-If the --aidge_cmp option is enabled, the feature maps generated by aidge with the
-backend cpu will be exported in the generated export. It will be used as reference
-to verify that the results with the optimized kernels are correct for the exported
-model.
-This option has to be passed to each node in order to be used within the Export Nodes.
-(JConv, JPad, ...) that you can find in the "operators" folder.
-"""
+export_folder_name = f"export_lenet_{args.dtype}"
 
-if AIDGE_CMP:
-    for node in model.get_nodes():
-        node.attributes().aidge_cmp = True
+aidge_export_cpp.export(
+    export_folder_name=export_folder_name,
+    model=model,
+    scheduler=scheduler,
+    inputs_tensor=tensors[0],
+    labels=aidge_core.Tensor(labels[0]),
+    dev_mode=args.dev_mode,
+    aidge_cmp=args.aidge_cmp,
+    memory_manager_args={"wrapping": True} if args.mem_wrap else {})
 
 # --------------------------------------------------------------
-# EXPORT THE MODEL
+# COMPILING THE EXPORT
 # --------------------------------------------------------------
 
-model.save("exported_model")
-
-aidge_export_cpp.export(EXPORT_FOLDER,
-                        model,
-                        scheduler,
-                        # tensors[0],
-                        labels = aidge_core.Tensor(labels[0]),
-                        dev_mode = DEV_MODE,
-                        aidge_cmp = AIDGE_CMP)
 print("\n### Compiling the export ###")
 try:
-    for std_line in aidge_core.utils.run_command(["make"], cwd=EXPORT_FOLDER):
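+    # AIDGE_CMP=<bool> is assumed to toggle the comparisons against the
+    # embedded aidge reference feature maps; SAVE_OUTPUTS=false is assumed to
+    # skip dumping per-layer outputs to disk (both Makefile switches).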
+    command = ["make", f"AIDGE_CMP={str(args.aidge_cmp).lower()}", "SAVE_OUTPUTS=false"]
+    for std_line in aidge_core.utils.run_command(command, cwd=export_folder_name):
         print(std_line, end="")
 except subprocess.CalledProcessError as e:
     raise RuntimeError("An error occurred: failed to build the export.") from e
 print("\n### Running the export ###")
 try:
-    for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=EXPORT_FOLDER):
+    for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=export_folder_name):
         print(std_line, end="")
 except subprocess.CalledProcessError as e:
-    raise RuntimeError(0, f"An error occurred, failed to run export.") from e
\ No newline at end of file
+    raise RuntimeError("An error occurred: failed to run the export.") from e
diff --git a/examples/export_ResNet18/.gitignore b/examples/export_ResNet18/.gitignore
index a6e4e9706bae169c65e15f7a2a8c090fe21618c7..d4813dedaac332924cfe5242c6c49f4f6f4a14ee 100644
--- a/examples/export_ResNet18/.gitignore
+++ b/examples/export_ResNet18/.gitignore
@@ -1,6 +1,7 @@
 # Exclude export artefacts
-export_resnet18_int8/
-log_outputs/*
+export_resnet18_*
+log_outputs
+graph_log
 assets/*
 data/*
 log.txt
diff --git a/examples/export_ResNet18/resnet18.py b/examples/export_ResNet18/resnet18.py
index 81e335567194a83794bd4be8c2dcbd8056fa3ebf..2eba6431151c627be37b6c4223fa5f6abd75eda2 100644
--- a/examples/export_ResNet18/resnet18.py
+++ b/examples/export_ResNet18/resnet18.py
@@ -7,41 +7,35 @@ In order for this file to work properly, you should first download the imagenet
 (search for "ILSVRC2012").
 """
 
+import os
 import random
+import subprocess
 import numpy as np
-import os
-import shutil
 from PIL import Image
-import requests
 from pathlib import Path
-import subprocess
 from random import randint
+
 # Aidge Modules
 import aidge_core
 import aidge_onnx
+import aidge_export_cpp
 import aidge_backend_cpu
 import aidge_quantization
-import aidge_export_cpp
 
-from aidge_export_cpp.export_utils import (
-    cpp_fuse_to_metaops,
-    exclude_unwanted_producers,
-    set_nodes_names,
-    set_nodes_datatypes,
-    normalize)
-
-from aidge_core.export_utils import remove_optional_inputs, get_node_from_metaop
+from aidge_export_cpp.export_utils import *
 
 # Torch (Dataset)
 import torch
 from torchvision import transforms
+
 # Arguments
 import argparse
 
 supported_types = ["float32", "int8"]
 
 parser = argparse.ArgumentParser(description="Export the ResNet18 model with the aidge_export_cpp module.")
-parser.add_argument("--dev", action="store_true", help="Export in dev mode")
+parser.add_argument("--dev_mode", action="store_true", help="Export in dev mode")
+parser.add_argument("--mem_wrap", action="store_true", help="Use memory wrapping")
 parser.add_argument("--no_cuda", action="store_true", help="Disable USE_CUDA usage to perform inferences and training.")
 parser.add_argument("--dtype", type=str, choices=supported_types, default="float32", help="Specify the targeted datatype : [int8, float32]")
 parser.add_argument("--aidge_cmp", action="store_true", help="Use aidge tensor results as reference.")
@@ -80,8 +74,14 @@ parser.add_argument(
 
 args = parser.parse_args()
 
+# CUDA Usage
 USE_CUDA = not args.no_cuda
 
+if USE_CUDA:
+    import aidge_backend_cuda
+
+backend = "cuda" if USE_CUDA else "cpu"
+
 # Setting Aidge verbose level
 if args.verbose == 0:
     aidge_core.Log.set_console_level(aidge_core.Level.Error)
@@ -92,124 +92,32 @@ elif args.verbose == 2:
 elif args.verbose >= 3:
     aidge_core.Log.set_console_level(aidge_core.Level.Debug)
 
-if USE_CUDA:
-    import aidge_backend_cuda
-
-# ------------------------------------------------------------
-# EXPORT CONFIG
-# ------------------------------------------------------------
-
-"""
-Export configuration details :
-- RNG_SEED :        Fix a random seed for torch to always get the same images from the dataset,
-                        therefore always getting the same output.
-- NB_TEST :         Number of example inferences to perform (used to get an accuracy approximation).
-- NB_CALIB :        Number of samples used for the calibration step of quantization.
-- MODEL_NAME :      Should be the same name as the onnx file you want to load and export.
-- DO_EXAMPLES :     Perform example inferences (and allow to get accuracy approximation)
-- NB_BITS :         Quantization output precision. Should be 8 to work with this export.
-- TARGET_TYPE :     The aidge datatype for tensors to be casted after the quantization step.
-- OPTIM_SIGN :      Quantization optional optimization based on data sign.
-- SINGLE_SHIFT :    Quantization option specifying if inserted scaling nodes should be
-                        single shift or floating point.
-- NO_QUANT : Skip the quantization step. Should be set to False.
-- CLIPPING :        Clipping method during quantization.
-- FOLD_GRAPH :      The quantization step adds cast nodes to cast the graph into the given TARGET_TYPE.
-                        Enabling the FOLD_GRAPH will automatically fold these nodes into the following
-                        ones at the end of quantization step.
-- USE_CUDA :        Determine if the quantization step uses the GPU. It is generally recommended
-                        to enable this option if you have access to GPUs as the quantization step
-                        may take a while to complete.
-- DEV_MODE :        The dev mode allows to identify errors more easily export the model with
-                        symbolic links enabling to modify the source files directly in the
-                        generated export (make sure you installed the export plugin running
-                        `pip install -e .`).
-                        Enabled running this python file, adding the --test argument.
-- AIDGE_MODE :      Saves and export the outputs generated by the aidge inferences in order
-                        to compare it with the export outputs.
-                        Enabled running this python file, adding the --aidge_cmp argument.
-"""
-
-print(" Available backends : ", aidge_core.Tensor.get_available_backends())
-
-quantize_model = False
-NB_BITS = 32
-TARGET_TYPE = aidge_core.dtype.float32
+# Init random seed
+RNG_SEED        = 1234
+torch.manual_seed(RNG_SEED)
+random.seed(RNG_SEED)
+np.random.seed(RNG_SEED)
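+# (a fixed seed keeps the sampled images, and thus the reported accuracies, reproducible)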
 
-if args.dtype == "float32":
-    quantize_model = False
-elif args.dtype == "int8":
-    quantize_model = True
-    NB_BITS = 8
-    TARGET_TYPE = aidge_core.dtype.int32    # int8 not yet available
-else:
-    print(f"[ERROR] Datatype '{args.dtype}' not supported.")
-    print(f"[ERROR] Supported datatypes : {supported_types}.")
-    exit(1)
+# --------------------------------------------------------------
+# CREATE THE SAMPLES
+# --------------------------------------------------------------
 
-RNG_SEED        = 1234
 NB_TEST         = 20 # Test set
 NB_CALIB        = 20 # Calibration set
-MODEL_NAME      = 'resnet18'
-EXPORT_FOLDER   = f"export_{MODEL_NAME}_int8"
-DO_EXAMPLES     = True
-
-# Quantization params
-OPTIM_SIGN      = False
-SINGLE_SHIFT    = True
-ROUNDING        = True
-NO_QUANT        = False
-CLIPPING        = aidge_quantization.Clipping.MSE  # 'MAX'
-FOLD_GRAPH      = True
-
-# Export modes
-DEV_MODE      = args.dev
-AIDGE_CMP     = args.aidge_cmp
-
-# Path to databases
-IMAGENET_PATH = args.imagenet_path # Path to ImageNet database
-LABEL_PATH = args.imagenet_labels           # File containing labels of image of val folder (Look for val.txt)
-###########################
-
-def print_cfg():
-    print("")
-    print(' RNG_SEED         = ', RNG_SEED)
-    print(' MODEL_NAME       = ', MODEL_NAME)
-    print(' NB_TEST          = ', NB_TEST)
-    print(' NB_CALIB         = ', NB_CALIB)
-    print(' NB_BITS          = ', NB_BITS)
-    print(' OPTIM_SIGN       = ', OPTIM_SIGN)
-    print(' NO_QUANT         = ', NO_QUANT)
-    print(' CLIPPING         = ', CLIPPING)
-    print(' SINGLE_SHIFT     = ', SINGLE_SHIFT)
-    print(' TARGET_TYPE      = ', TARGET_TYPE)
-    print(' FOLD_GRAPH       = ', FOLD_GRAPH)
-    print(' USE_CUDA         = ', USE_CUDA)
-    print(' DEV_MODE         = ', DEV_MODE)
-    print(' IMAGENET_PATH    = ', IMAGENET_PATH)
-    print(' LABEL_PATH       = ', LABEL_PATH)
-    print(' MOCK_DB          = ', args.mock_db)
-
-print_cfg()
-
-torch.manual_seed(RNG_SEED)
-random.seed(RNG_SEED)
-np.random.seed(RNG_SEED)
 
-backend = "cuda" if USE_CUDA else "cpu"
-aidge_tensors = []
+tensors = []
 labels  = []
 if args.mock_db:
-    for i in range(NB_TEST):
-        aidge_tensor = aidge_core.Tensor(dims=(1, 3, 224, 224))
-        aidge_tensor.set_backend(backend)
-        aidge_tensor.set_datatype(aidge_core.dtype.float32)
-        aidge_core.uniform_filler(aidge_tensor, -1.0, 1.0)
-        aidge_tensors.append(aidge_tensor)
+    for i in range(max(NB_TEST, NB_CALIB)):
+        tensor = aidge_core.Tensor(dims=(1, 3, 224, 224))
+        tensor.set_backend(backend)
+        tensor.set_datatype(aidge_core.dtype.float32)
+        aidge_core.uniform_filler(tensor, -1.0, 1.0)
+        tensors.append(tensor)
         labels.append(randint(1, 1000))
 else:
     image_label_pairs = []
-    with open(LABEL_PATH, 'r') as f:
+    with open(args.imagenet_labels, 'r') as f:
         for line in f:
             parts = line.strip().split()
             if len(parts) == 2:
@@ -220,23 +128,15 @@ else:
     NB_SELECT = max(NB_TEST, NB_CALIB)  # Check that NB_TEST and NB_CALIB are fixed
     selected_pairs = image_label_pairs[:NB_SELECT]
 
-    # --------------------------------------------------------------
-    # CREATE THE SAMPLES
-    # --------------------------------------------------------------
-
     transform_val = transforms.Compose([transforms.Resize(256),
                                         transforms.CenterCrop(224),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                         ])
 
-    tensors = []
-    labels  = []
     paths   = []
-    index   = 0
-
     for image_name, label in selected_pairs:
-        image_path = os.path.join(IMAGENET_PATH, image_name)
+        image_path = os.path.join(args.imagenet_path, image_name)
         if os.path.exists(image_path):
             try:
                 image = Image.open(image_path)
@@ -250,50 +150,39 @@ else:
                 print(f"Error with image {image_path}: {e}")
 
 
-    for tensor in tensors:
+    for i, tensor in enumerate(tensors):
         array = tensor.numpy()
         array = np.reshape(array, (1, 3, 224, 224))
         array = normalize(array)
         aidge_tensor = aidge_core.Tensor(array)
         aidge_tensor.set_backend(backend)
         aidge_tensor.set_datatype(aidge_core.dtype.float32)
-        aidge_tensors.append(aidge_tensor)
-
+        tensors[i] = aidge_tensor
 
 # --------------------------------------------------------------
-# LOAD THE MODEL
+# DOWNLOAD & LOAD THE MODEL
 # --------------------------------------------------------------
 
-"""
-Load the .onnx model and perform some usual graph modifications :
-    - Remove the flatten nodes;
-    - Fuse the batchnorm nodes into the biases producers.
-    - Expand the metaOperators to perform the desired fusions.
-"""
-
-# Define the target path and filename
+# Download the model
 file_url = "https://huggingface.co/EclipseAidge/resnet18/resolve/main/resnet18_imagenet_1k.onnx?download=true"
-file_path = Path(MODEL_NAME + "_imagenet_1k.onnx")
-
+file_path = "resnet18_imagenet_1k.onnx"
 aidge_core.utils.download_file(file_path, file_url)
 
+# Load the model
 model = aidge_onnx.load_onnx(file_path, verbose=False)
 
-model.save("imported_model")
+# --------------------------------------------------------------
+# CLEAN THE MODEL
+# --------------------------------------------------------------
+
 aidge_core.remove_flatten(model)
 aidge_core.fuse_batchnorm(model)
-aidge_core.expand_metaops(model)
-model.save("imported_model_fused_bn")
+aidge_core.expand_metaops(model, name_format="{0}_{1}_{2}")
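+# name_format names the nodes created by the expansion: the export needs named
+# operators, as node names are used to name the generated files (the format
+# fields are assumed to be derived from the parent metaop).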
 
 # --------------------------------------------------------------
-# SET UP THE AIDGE SCHEDULER
+# TEST THE MODEL
 # --------------------------------------------------------------
 
-"""
-The scheduler is an ordered version of the model, allowing to schedule
-nodes to be able to run inferences, for instance.
-"""
-
 # Set up the backend
 model.set_datatype(aidge_core.dtype.float32)
 model.set_backend(backend)
@@ -301,113 +190,73 @@ model.set_backend(backend)
 # Create the Scheduler
 scheduler = aidge_core.SequentialScheduler(model)
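+# The scheduler is an ordered view of the model, used to run the inferences below.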
 
-# --------------------------------------------------------------
-# RUN SOME EXAMPLES INFERENCES
-# --------------------------------------------------------------
-
-def propagate(model, scheduler, aidge_tensor):
-    """ Propagate the given tensor into the model
-    """
-    # Run the inference
-    scheduler.forward(True, [aidge_tensor])
-    # Gather the results
-    output_node = model.get_output_nodes().pop()
-    output_tensor = output_node.get_operator().get_output(0).clone()
-    output_tensor.set_backend("cpu")
-    return np.array(output_tensor)
-
+# Run some examples
 accuracy = 0
-if (DO_EXAMPLES):
-    print('\n EXAMPLE INFERENCES :')
-    nb_valid = 0
-    base_values = []
-    for i in range(NB_TEST):
-        output_array = propagate(model, scheduler, aidge_tensors[i])
-        print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
-        base_values.append(np.max(output_array))
-        if (labels[i] == np.argmax(output_array)):
-            nb_valid += 1
-    accuracy = nb_valid / NB_TEST
-    print('\n MODEL ACCURACY = ', accuracy * 100, '%')
+print('\n EXAMPLE INFERENCES :')
+nb_valid = 0
+base_values = []
+for i in range(NB_TEST):
+    output_array = propagate(model, scheduler, tensors[i])
+    print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
+    base_values.append(np.max(output_array))
+    if (labels[i] == np.argmax(output_array)):
+        nb_valid += 1
+accuracy = nb_valid / NB_TEST
+print('\n MODEL ACCURACY = ', accuracy * 100, '%')
 
-
-#--------------------------------------------------------------
+# --------------------------------------------------------------
 # PERFORM THE QUANTIZATION
 # --------------------------------------------------------------
 
-if quantize_model:
+if args.dtype != "float32":
+
+    TARGET_TYPE = aidge_core.dtype.int32  # int8 not yet available as a target type
+    if args.dtype == "int8":
+        NB_BITS = 8
+    else:
+        aidge_core.Log.fatal(f"Type {args.dtype} not yet supported.\n"
+                             f"Supported types: {supported_types}.")
+
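+    # single_shift=True makes the inserted scaling nodes use a single bit
+    # shift rather than a floating-point multiply; Clipping.MSE selects the
+    # clipping threshold that minimizes the mean squared quantization error.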
     aidge_quantization.quantize_network(
         network = model,
         nb_bits = NB_BITS,
-        calibration_set = aidge_tensors[0:NB_CALIB],
-        clipping_mode = CLIPPING,
+        calibration_set = tensors[0:NB_CALIB],
         target_type = TARGET_TYPE,
-        no_quant = NO_QUANT,
-        optimize_signs = OPTIM_SIGN,
-        single_shift = SINGLE_SHIFT,
-        use_cuda = USE_CUDA,
-        fold_graph = FOLD_GRAPH)
-
-# Tag the scaling producers
-for node in model.get_nodes():
-    if node.type() == "Quantizer":
-        for SNode in get_node_from_metaop(node, "BitShift"):
-            SNode.get_parent(1).attributes().shift_prod = True
-        for CNode in get_node_from_metaop(node, "Mul"):
-            CNode.get_parent(1).attributes().coef_prod = True
-
-model.save("post_ptq_model")
+        single_shift = True,
+        clipping_mode = aidge_quantization.Clipping.MSE,
+        use_cuda = USE_CUDA)
 
-# --------------------------------------------------------------
-# RESCALE THE INPUT SAMPLES
-# --------------------------------------------------------------
+    # --------------------------------------------------------------
+    # TEST THE QUANTIZED MODEL
+    # --------------------------------------------------------------
 
-"""
-Once the quantization is done, the graph now only accepts integer inputs.
-So we need to rescale the dataset for the data to be within [0, 255].
-Also, tensors should be casted to be the same type as TARGET_TYPE.
-"""
-if quantize_model:
+    # The CUDA backend does not yet support all quantized nodes
+    if USE_CUDA:
+        model.set_backend("cpu")
+
+    # Quantize the samples
     rescaling = 2**(NB_BITS-1)-1
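+    # e.g. NB_BITS = 8 -> rescaling = 2**7 - 1 = 127, mapping inputs assumed
+    # normalized to [-1, 1] onto the signed 8-bit range [-127, 127]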
-    for i in range(max(NB_TEST, NB_CALIB)):
-        array = np.array(aidge_tensors[i]) * rescaling
+    for i in range(NB_TEST):
+        tensors[i].set_backend("cpu")
+        array = np.array(tensors[i]) * rescaling
         array = np.round(array).astype(int)
-        aidge_tensors[i] = aidge_core.Tensor(array)
-        aidge_tensors[i].set_datatype(TARGET_TYPE)
-        aidge_tensors[i].set_backend("cpu")
-    # Setting modele to CPU for export
-    model.set_backend("cpu")
-
-# --------------------------------------------------------------
-# GENERATE NEW SCHEDULER
-# --------------------------------------------------------------
-
-"""
-Each time the graph has been change, it has to be reset.
-Here some Quantizer and Cast nodes have been added.
-"""
-
-""" [Issue]
-We need first to manually add an input tensor with the correct datatype,
-as it is not automatically done in PTQ.
-"""
-if quantize_model:
-    input_node = model.get_ordered_inputs()[0]
-    input_node[0].get_operator().set_input(0, aidge_tensors[0])
+        tensors[i] = aidge_core.Tensor(array)
+        tensors[i].set_datatype(TARGET_TYPE)
+
+    # The input tensors have not been cast by the quantization step, so cast them here
+    for node in model.get_ordered_inputs():
+        node[0].get_operator().get_input(0).set_datatype(TARGET_TYPE)
+
+    # Reset the scheduler as the graph may have changed
     scheduler.reset_scheduling()
+    scheduler.generate_scheduling()
 
-# --------------------------------------------------------------
-# PERFORM THE EXAMPLE INFERENCES AGAIN
-# --------------------------------------------------------------
-
-model.save("post_ptq")
-
-if (DO_EXAMPLES and quantize_model):
+    # Perform example inferences on quantized model
     print('\n QUANTIZED EXAMPLE INFERENCES :')
     nb_valid = 0
     post_values = []
     for i in range(NB_TEST):
-        output_array = propagate(model, scheduler, aidge_tensors[i])
+        output_array = propagate(model, scheduler, tensors[i])
         print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array))
         post_values.append(np.max(output_array))
         if (labels[i] == np.argmax(output_array)):
@@ -417,164 +266,38 @@ if (DO_EXAMPLES and quantize_model):
     print('\n MODEL ACCURACY = ', accuracy * 100, '%')
     print('\n QUANTIZED ACCURACY = ', quant_accuracy * 100, '%')
 
-    output_array = propagate(model, scheduler, aidge_tensors[0])
-
-if USE_CUDA:
-    model.set_backend("cpu")
-    for aidge_tensor in aidge_tensors:
-        aidge_tensor.set_backend("cpu")
-
-# --------------------------------------------------------------
-# FUSE NODES INTO METAOPS
-# --------------------------------------------------------------
-
-"""
-Here is made the link between the Aidge model and the CPP
-kernels implementation. In aidge, all the nodes calculations
-are performed separately (Pad -> Conv -> Quantizer -> ReLU -> ...).
-
-However within the CPP export, some core operators are merged
-in meta operators. For instance, the padding, scaling and ReLU are
-performed within the Conv kernel.
-
-In this step, we use graph regex techniques to find the desired patterns
-within the graph in order to match the export implementation of the kernels.
-"""
-
-# Exclude unwanted producers
-"""
-Before fusing the nodes, we set a tag on the Producers in order to exclude
-from the export the ones holding coefficients, as they are directly handled
-within the layers parameters.
-"""
-exclude_unwanted_producers(model)
-
-# Fuse nodes
-cpp_fuse_to_metaops(model)
-
-# Remove optional inputs
-"""
-Some optional inputs may be added by the quantization step (for instance with the clipping nodes).
-Here we make sure that they will not be considered as actual graph inputs by the export, by
-excluding them from the ordered_inputs list of the model.
-"""
-remove_optional_inputs(model)
-
-# Reset scheduler to apply graph modifications
-"""
-The scheduler always needs to be reset after graph manipulation.
-"""
-scheduler.reset_scheduling()
-
-# Name newly created MetaOps
-"""
-As names are optional in Aidge, the fuse_to_metaops function will not automatically
-give a name to the newly created metaOps. However, in an export context, we need
-our operators to be named, as this will be used to name the corresponding files.
-"""
-scheduler.generate_scheduling() # Scheduler needs to be generated as it has just been reset
-set_nodes_names(scheduler)
-
-# --------------------------------------------------------------
-# LOG OUTPUTS FOR THE LAST IMAGE OF THE TEST DATASET
-# --------------------------------------------------------------
-
-"""
-Here a final inference is made on the input we want to export and run.
-This will ensure that all the feature maps tensors (between the layers)
-hold the data corresponding to this specific input.
-Then, the "log_outputs()" function (called later) will store these tensors
-into log files that may be exported as well for comparison purpose.
-"""
-
-output_array = propagate(model, scheduler, aidge_tensors[0])
+    output_array = propagate(model, scheduler, tensors[0])
 
-print("### Exported Sample ###")
-print("Aidge prediction after quantization :", np.argmax(output_array), "(" + str(np.max(output_array)) + ")")
-print("Label :", labels[0])
-
-# --------------------------------------------------------------
-# HANDLING DATATYPE
 # --------------------------------------------------------------
-
-"""
-Now, despite the quantization stage, all the tensors of the model are
-still "virtually" in Int32. Before exporting the model, we have to set
-tensors' datatypes to Int8, except for biases which should remain in Int32.
-"""
-if quantize_model:
-    set_nodes_datatypes(model)
-
-# Store tensors values into log files
-"""
-Once the tensors has been casted, the log_outputs() function can be
-called to store their values into log files.
-"""
-
-if os.path.isdir("log_outputs"):
-    shutil.rmtree("log_outputs")
-model.log_outputs("log_outputs")
-
-# --------------------------------------------------------------
-# TEST MODE
-# --------------------------------------------------------------
-
-"""
-The test mode is mainly used for validation and benchmark. The model will be
-exported in a way that each layer's result will be compared with the CPU implementation.
-The timings for each layer will be displayed.
-In case of error, you will be able to enter debug mode, showing in-layer data or
-changing the inputs of the layer, to isolate the source of the issue.
-"""
-
-for node in model.get_nodes():
-    node.attributes().dev_mode = DEV_MODE
-
-# --------------------------------------------------------------
-# AIDGE CMP
+# EXPORT THE MODEL
 # --------------------------------------------------------------
 
-"""
-If the --aidge_cmp option is enabled, the feature maps generated by aidge with the
-backend cpu will be exported in the generated export. It will be used as reference
-to verify that the results with the optimized kernels are correct for the exported
-model.
-This option has to be passed to each node in order to be used within the Export Nodes.
-(JConv, JPad, ...) that you can find in the "export_gen/operator_export" folder.
-"""
+export_folder_name = f"export_resnet18_{args.dtype}"
 
-if AIDGE_CMP:
-    for node in model.get_nodes():
-        node.attributes().aidge_cmp = True
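+# dev_mode exports the kernel sources as symbolic links, so they can be edited
+# directly inside the generated export (install the plugin with `pip install -e .`);
+# aidge_cmp embeds the aidge CPU feature maps as references for the optimized kernels.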
+aidge_export_cpp.export(
+    export_folder_name=export_folder_name,
+    model=model,
+    scheduler=scheduler,
+    inputs_tensor=tensors[0],
+    labels=aidge_core.Tensor(labels[0]),
+    dev_mode=args.dev_mode,
+    aidge_cmp=args.aidge_cmp,
+    memory_manager_args={"wrapping": True} if args.mem_wrap else {})
 
 # --------------------------------------------------------------
-# EXPORT THE MODEL
+# COMPILING THE EXPORT
 # --------------------------------------------------------------
 
-model.save("exported_model")
-inputs_tensor = aidge_core.Tensor(np.array(aidge_tensors[0]))
-inputs_tensor.set_data_format(aidge_core.dformat.nchw)  # Init the dataformat (default -> nchw)
-inputs_tensor.set_data_format(aidge_core.dformat.nhwc)  # Transpose the data  (nchw -> nhwc)
-if args.dtype == "int8":
-    inputs_tensor.set_datatype(aidge_core.dtype.int8)
-
-aidge_export_cpp.export(EXPORT_FOLDER,
-                        model,
-                        scheduler,
-                        labels = aidge_core.Tensor(labels[0]),
-                        inputs_tensor=inputs_tensor,
-                        dev_mode = DEV_MODE,
-                        aidge_cmp = AIDGE_CMP)
-
 print("\n### Compiling the export ###")
 try:
-    for std_line in aidge_core.utils.run_command(["make"], cwd=EXPORT_FOLDER):
+    command = ["make", f"AIDGE_CMP={str(args.aidge_cmp).lower()}", "SAVE_OUTPUTS=false"]
+    for std_line in aidge_core.utils.run_command(command, cwd=export_folder_name):
         print(std_line, end="")
 except subprocess.CalledProcessError as e:
     raise RuntimeError("An error occurred: failed to build the export.") from e
 print("\n### Running the export ###")
 try:
-    for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=EXPORT_FOLDER):
+    for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=export_folder_name):
         print(std_line, end="")
 except subprocess.CalledProcessError as e:
-    raise RuntimeError(0, f"An error occurred, failed to run export.") from e
\ No newline at end of file
+    raise RuntimeError("An error occurred: failed to run the export.") from e
diff --git a/version.txt b/version.txt
index 9e11b32fcaa96816319e5d0dcff9fb2873f04061..d15723fbe8de36b1c3ae302c77d8095459ea88e6 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.1
+0.3.2