Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (397)
Showing with 1535 additions and 478 deletions
......@@ -13,6 +13,11 @@ __pycache__
dist*/
aidge_export_cpp/_version.py
wheelhouse/*
env_aidge/
# Temp test folders
aidge_export_cpp/unit_tests/*_temp_test
*_test/
# Mermaid
*.mmd
......@@ -22,3 +27,6 @@ xml*/
# ONNX
*.onnx
# GDB
.gdb_history
\ No newline at end of file
......@@ -12,12 +12,21 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/pip.gitlab-ci.yml'
# Since aidge_export_cpp is a pure Python package, building on Windows and on Ubuntu doesn't differ
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
before_script:
- !reference [.setup:test:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
# Need to install extra dependencies for tests:
- python -m pip install torch torchvision
\ No newline at end of file
# Version 0.2.1 (January 31, 2025)
# Version 0.2.0 (December 6, 2024)
# Version 0.0.1 (January 23, 2024)
......
......@@ -2,15 +2,15 @@ r"""
Aidge Export for CPP standalone projects
"""
from .export_registry import ExportLibCpp
from .operators import *
from collections import defaultdict
import aidge_core
from pathlib import Path
from aidge_export_cpp.utils import ROOT
from ._version import *
# Constants
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]
from .export_registry import ExportLibCpp
from .export_utils import *
from .operators import *
from .export import *
from . import benchmark
import contextlib
import os
from shutil import rmtree
from subprocess import run
import numpy as np
import aidge_core
import aidge_backend_cpu
import aidge_export_cpp
def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# load and set up the model
# model.set_datatype(ai.dtype.float32)
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
# for i in input_data:
# nb_dims = len(i[1].shape)
# if nb_dims == 3:
# ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
# if nb_dims == 4:
# ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
# else:
# ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
op.associate_input(i, ordered_inputs[i])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
for i in range(len(ordered_inputs)):
ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
model.set_dataformat(aidge_core.dformat.nhwc)
model.set_backend(aidge_export_cpp.ExportLibCpp._name)
aidge_core.adapt_to_backend(model)
aidge_core.adapt_fc_params_format(model)
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# for ordered_input in ordered_inputs:
# ordered_input.set_backend("cpu")
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
# load and set up the model
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
op.associate_input(i, ordered_inputs[i])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
for i in range(len(ordered_inputs)):
ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
model.set_dataformat(aidge_core.dformat.nhwc)
model.set_backend(aidge_export_cpp.ExportLibCpp._name)
aidge_core.adapt_to_backend(model)
aidge_core.adapt_fc_params_format(model)
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler.reset_scheduling()
scheduler.generate_scheduling()
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
outputs_str: list[str] = output_str.stdout.strip().split('\n')
outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
output_tensors = []
outputs_dims = [pair[0].get_operator().get_output(pair[1]).dims() for pair in model.get_ordered_outputs()]
for out_idx, arr in enumerate(outputs):
t = aidge_core.Tensor(arr.reshape(outputs_dims[out_idx]))
t.set_data_format(aidge_core.dformat.nhwc)
t.set_data_format(aidge_core.dformat.nchw)
output_tensors.append(np.array(t))
return output_tensors
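For reference, here is a minimal usage sketch of the two helpers above. The model source, input name and shape are illustrative assumptions (they are not part of this diff); aidge_onnx is assumed to be available, as the CI dependency list suggests.

    # Hypothetical usage of measure_inference_time() and compute_output().
    import numpy as np
    import aidge_onnx
    from aidge_export_cpp import benchmark

    model = aidge_onnx.load_onnx("model.onnx")           # any aidge_core.GraphView works here
    input_data = [("input_0", np.random.rand(1, 3, 32, 32).astype(np.float32))]

    timings = benchmark.measure_inference_time(model, input_data,
                                               nb_warmup=5, nb_iterations=20)
    print("mean timing:", sum(timings) / len(timings))

    outputs = benchmark.compute_output(model, input_data)
    print("first output shape:", outputs[0].shape)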
import re
import os
import numpy as np
import shutil
from pathlib import Path
from typing import List, Union
import aidge_core
from aidge_core.mem_info import generate_optimized_memory_info
from aidge_core.export_utils import scheduler_export, generate_main_cpp
from aidge_core.export_utils.code_generation import *
from aidge_core.mem_info import compute_default_mem_info
from aidge_export_cpp.utils import ROOT
from aidge_export_cpp.utils.converter import numpy_dtype2ctype
from aidge_export_cpp import ExportLibCpp
from aidge_export_cpp.utils.generation import *
# from aidge_export_cpp.memory import *
def generate_input_file(export_folder:str,
array_name:str,
array: np.ndarray):
# If directory doesn't exist, create it
if not os.path.exists(export_folder):
os.makedirs(export_folder)
generate_file(
file_path=f"{export_folder}/{array_name}.h",
template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
dims = array.shape,
data_t = numpy_dtype2ctype(array.dtype),
name = array_name,
values = array.tolist()
)
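A minimal sketch of calling generate_input_file(); the export folder and array name below are illustrative only.

    # Write a C header for a sample input tensor using the inputs.jinja template.
    import numpy as np

    sample = np.random.rand(1, 3, 8, 8).astype(np.float32)
    generate_input_file(export_folder="export_cpp/dnn/include",   # illustrative path
                        array_name="conv1_input_0",               # illustrative name
                        array=sample)
    # -> creates export_cpp/dnn/include/conv1_input_0.h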
def export(export_folder_name, graphview, scheduler, mem_wrapping=False):
aidge_core.export_utils.scheduler_export(
scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=compute_default_mem_info
)
# export_folder = Path().absolute() / export_folder_name
# os.makedirs(str(export_folder), exist_ok=True)
# dnn_folder = export_folder / "dnn"
# os.makedirs(str(dnn_folder), exist_ok=True)
# list_actions = []
# list_configs = []
# peak_mem, mem_info = compute_default_mem_info(scheduler)
# list_forward_nodes = scheduler.get_static_scheduling()
# for node in list_forward_nodes:
# if ExportLibCpp.exportable(node):
# op = ExportLibCpp.get_export_node(node)(node, mem_info[node])
# # For configuration files
# list_configs = op.export(dnn_folder, list_configs)
# # For forward file
# list_actions = op.forward(list_actions)
# else:
# raise RuntimeError(f"Operator not supported: {node.type()} !")
# # Memory management
# # stats_folder = export_folder / "statistics"
# # os.makedirs(str(stats_folder), exist_ok=True)
# # mem_size, mem_info = generate_optimized_memory_info(stats_folder, scheduler, mem_wrapping)
# # peak_mem, mem_info = compute_default_mem_info(scheduler)
# # Generate the memory file
# # generate_file(
# # str(dnn_folder / "memory" / "mem_info.h"),
# # str(ROOT / "templates" / "memory" / "mem_info.jinja"),
# # mem_size = mem_size,
# # mem_info_legends = MEMORY_INFO_TEMPLATE,
# # mem_info = mem_info
# # )
# # list_configs.append("memory/mem_info.h")
# # Get entry nodes
# # Store the datatype & name
# list_inputs_name = []
# for node in graphview.get_input_nodes():
# for idx, node_input_tuple in enumerate(node.inputs()):
# node_input, _ = node_input_tuple
# if node_input is None:
# export_type = aidge2c(node.get_operator().get_output(0).dtype())
# list_inputs_name.append((export_type, f"{node.name()}_input_{idx}"))
# elif node_input not in graphview.get_nodes():
# export_type = aidge2c(node_input.get_operator().get_output(0).dtype())
# list_inputs_name.append((export_type, node_input.name()))
# # Get output nodes
# # Store the datatype & name, like entry nodes
# list_outputs_name = []
# for node in graphview.get_nodes():
# if len(node.get_children()) == 0:
# export_type = aidge2c(node.get_operator().get_output(0).dtype())
# list_outputs_name.append((export_type, f"{node.name()}_output_0"))
# # Generate forward file
# # TODO: for now the mem type is bound for all intermediate results, should change.
# # Note that we may have all inputs constants, hence select output type
# assert len(list_outputs_name) >= 1, f"TODO: requires some output to determine mem type"
# mem_ctype = list_outputs_name[0][0]
# generate_file(
# str(dnn_folder / "src" / "forward.cpp"),
# str(ROOT / "templates" / "network" / "network_forward.jinja"),
# headers=set(list_configs),
# actions=list_actions,
# inputs= list_inputs_name,
# outputs=list_outputs_name,
# mem_ctype=mem_ctype,
# peak_mem=peak_mem
# )
# # Generate dnn API
# generate_file(
# str(dnn_folder / "include" / "dnn.hpp"),
# str(ROOT / "templates" / "network" / "dnn_header.jinja"),
# libraries=[],
# functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
# )
# # Copy all static files in the export
# shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
# shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
# shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
from aidge_export_cpp.export_utils import *
def export(export_folder_name: str,
model: aidge_core.GraphView,
scheduler: Union[List[aidge_core.Node],
aidge_core.Scheduler],
inputs_tensor: aidge_core.Tensor = None,
labels: aidge_core.Tensor = None,
dev_mode: bool = False,
aidge_cmp: bool = False,
memory_manager = generate_optimized_memory_info,
memory_manager_args = {}):
""" Export an aidge_core.Scheduler to C++ code
:param export_folder_name: Export folder name
:type export_folder_name: str
:param model: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
:type model: aidge_core.GraphView
:param scheduler: Scheduler instance managing the computation graph.
                  Uses `graph_view` and `get_sequential_static_scheduling` methods
                  to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
:param inputs_tensor: **For future** argument to provide a tensor to use in the main function, not implemented yet!
                      By default, the inputs of the given graph will be exported.
:type inputs_tensor: aidge_core.Tensor
:param labels: Argument to provide labels tensor to generate and use in the main function.
:type labels: aidge_core.Tensor
:param dev_mode: Whether or not the developer mode is enabled. If enabled, the export files
                 will be symlinks to the aidge_export_cpp module. Therefore, modifying
                 a file within the export will change the module as well.
:type dev_mode: boolean
:param aidge_cmp: If True, export the Aidge reference feature maps alongside the generated
                  code so that the export outputs can be compared against them.
:type aidge_cmp: boolean
"""
# Graph Log (Save the state of the graph through export steps)
os.makedirs("graph_log", exist_ok=True)
model.save(f"graph_log/0_export_start")
# Remove scaling producers from the export
exclude_unwanted_producers(model)
# Fuse nodes into MetaOps adapted to the CPP Export
cpp_fuse_to_metaops(model)
model.save(f"graph_log/1_fused_model")
# Reset the scheduler after graph modification
scheduler = aidge_core.SequentialScheduler(model) if scheduler is None else scheduler
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# Normalize nodes names
set_nodes_names(scheduler)
model.save(f"graph_log/2_named_model")
# Last inference to set the inputs as well as the ifmaps (aidge_cmp)
if inputs_tensor is not None:
output_array = propagate(model, scheduler, inputs_tensor)
aidge_core.Log.notice(f"Exported sample results : {np.argmax(output_array)} ( {str(np.max(output_array))} )")
aidge_core.Log.notice(f"Label : {labels}")
elif aidge_cmp:
aidge_cmp = False
aidge_core.Log.error("aidge_cmp : No input_tensor has been provided to the export() function.\n\
Therefore ifmaps have not been generated and aidge_cmp cannot be used.")
# Set nodes datatypes if the model has been quantized
# TODO : Should be changed with future quantization feature
if inputs_tensor is not None:
if inputs_tensor.dtype() == aidge_core.dtype.int32:
set_nodes_datatypes(model) # Set datatype to int8 only
inputs_tensor.set_datatype(aidge_core.dtype.int8)
model.save(f"graph_log/3_set_datatypes")
# [aidge_cmp] Export feature maps tensors as json
if aidge_cmp:
generate_aidge_ifmaps(model)
# [aidge_cmp] Set flags on each node
if aidge_cmp:
for node in model.get_nodes():
node.attributes().aidge_cmp = True
# Set model's dataformat (NHWC)
## Inputs
for in_node in model.get_ordered_inputs():
input = in_node[0].get_operator().get_input(0)
if input is not None:
# Transpose the input
input_cpy = input.clone()
input_cpy.set_data_format(aidge_core.dformat.nchw)
input_cpy.set_data_format(aidge_core.dformat.nhwc)
in_node[0].get_operator().set_input(0, input_cpy)
## Rest of the graph
model.set_dataformat(aidge_core.dformat.nhwc)
model.save(f"graph_log/4_set_dataformats")
# Set model's backend
model.set_backend(ExportLibCpp._name)
# Adapt the graph to the selected backend
aidge_core.adapt_to_backend(model)
model.save(f"graph_log/5_adapt_to_backend")
aidge_core.adapt_fc_params_format(model)
model.save(f"graph_log/6_adapt_fc_params_format")
# At this point, the graph dimensions are supposed to be statically
# forwardable, thus allow_data_dependency can be safely set to True
dims = []
for in_node in model.get_ordered_inputs():
dims.append(in_node[0].get_operator().get_input(0).dims())
model.forward_dims(dims=dims, allow_data_dependency=True)
# Reset the scheduling as the graph may have been changed
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# Remove existing export
export_folder_name = Path(export_folder_name)
if os.path.isdir(export_folder_name):
print("Removing existing export directory...")
shutil.rmtree(export_folder_name)
# Save the model
model.save("graph_log/7_exported_model")
# Setup stats folder
if "stats_folder" not in memory_manager_args:
memory_manager_args["stats_folder"] = f"{export_folder_name}/stats"
# Generate the export
scheduler_export(scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=memory_manager,
memory_manager_args=memory_manager_args,
dev_mode=dev_mode)
model.save(f"{export_folder_name}/graph")
# Generate main file
generate_main_cpp(export_folder_name, model, labels=labels, inputs_tensor=inputs_tensor)
# Generate log files (aidge_cmp option)
if aidge_cmp:
export_aidge_ifmaps(export_folder_name)
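A hedged end-to-end sketch of driving the export() entry point defined above. Model loading, shapes and the folder name are assumptions for illustration; depending on the model, quantization or dimension propagation may be needed beforehand.

    import numpy as np
    import aidge_core
    import aidge_onnx
    from aidge_export_cpp import export

    model = aidge_onnx.load_onnx("model.onnx")        # assumed model source
    model.set_backend("cpu")
    scheduler = aidge_core.SequentialScheduler(model)

    inputs_tensor = aidge_core.Tensor(
        np.random.rand(1, 3, 32, 32).astype(np.float32))

    export("export_cpp_model",            # export folder name (illustrative)
           model,
           scheduler,
           inputs_tensor=inputs_tensor)
    # The generated project is then built with `make` inside "export_cpp_model".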
from aidge_core.export_utils import ExportLib
from aidge_export_cpp.utils import ROOT
from aidge_export_cpp import ROOT
class ExportLibCpp(ExportLib):
_name="export_cpp"
static_files={
str(ROOT / "static" / "Makefile"): "",
str(ROOT / "static" / "include" / "network" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "include" / "network" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "rescaling_utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "activation_utils.hpp"): "dnn/include/network",
}
import os
import json
import numpy as np
from collections import OrderedDict
import aidge_core
from aidge_core.export_utils import get_node_from_metaop, aidge2c, generate_file
from aidge_export_cpp import ROOT
def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
"""
Fuse nodes into metaops adapted for the CPP Export
TODO: These recipes should be in aidge_core
:param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
ordered input/output data within the computational graph.
"""
cpp_recipes = OrderedDict({
# Quantization
"QMul": "Mul->Quantizer", # Fixed Point Scaling
# FC
"QFC": "FC->(Quantizer|QMul)",
"FCAct": "(FC|QFC)->ReLU",
# Conv
"QConv": "Conv2D->(Quantizer|QMul)",
"PadConv": "(QConv|Conv2D)<-Pad2D",
"ConvAct": "(QConv|Conv2D)->ReLU",
"PadConvAct": "PadConv->ReLU",
# ConvDw
"QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
"ConvDwPad": "(QConvDw|ConvDepthWise2D)->Pad2D",
"ConvDwAct": "(QConvDw|ConvConvDepthWise2D2D)->ReLU",
"ConvDwActPad": "ConvDwAct->Pad2D",
# Max Pooling
"PadMaxPool": "MaxPooling2D<-Pad2D",
"MaxPoolAct": "MaxPooling2D->ReLU",
"PadMaxPoolAct": "PadMaxPool->ReLU",
# Average Pooling
"PadAvgPool": "AvgPooling2D<-Pad2D",
"AvgPoolAct": "AvgPooling2D->ReLU",
"PadAvgPoolAct": "PadAvgPool->ReLU",
# Global Average Pooling
"PadGlobalAvgPool": "GlobalAveragePooling2D<-Pad2D",
"GlobalAvgPoolAct": "GlobalAveragePooling2D->ReLU",
"PadGlobalAvgPoolAct": "PadGlobalAveragePool->ReLU",
# ElemWise
"QAdd": "Add->(Quantizer|QMul)",
"QSub": "Sub->(Quantizer|QMul)",
# "QMul": "Mul->Quantizer", # Already defined
"AddAct": "(QAdd|Add)->ReLU",
"SubAct": "(QSub|Sub)->ReLU",
"MulAct": "(QMul|Mul)->ReLU",
# Activation
"QReLU": "ReLU->(Quantizer|QMul)",
})
for node, recipe in cpp_recipes.items():
aidge_core.fuse_to_metaops(graph_view, recipe, node)
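In the recipe strings above, `A->B` matches a node A followed by B, `A<-B` matches A preceded by B, and `(A|B)` is an alternation; each match is collapsed into a MetaOperator named after the dictionary key. A minimal sketch, assuming `model` is an existing aidge_core.GraphView and using only methods already seen in this file:

    # Fuse a (hypothetical) GraphView and list the resulting MetaOperator types.
    cpp_fuse_to_metaops(model)
    for node in model.get_nodes():
        if node.type() != "Producer":
            # e.g. a Conv2D followed by a ReLU now shows up as a single "ConvAct" node
            print(node.name(), node.type())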
def set_nodes_names(scheduler):
"""
Set the CPP nodes names as well as their producers.
The producers naming is handled from their child node.
[TODO] Fc and Conv layers will always have weights as parent 1 and
possibly biases as parent 2. It may be better to previously label the
producers.
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
"""
node_ids = {} # Dict holding the node type along with a counter
node_it = 0 # Node Iterator
## MetaOps
for node in scheduler.get_sequential_static_scheduling():
node_type = node.type()
if node_type != "Producer":
if node.type() not in node_ids:
node_ids[node_type] = 0
# Set node name
node.set_name("_" + str(node_it) + "_" +
str(node_type) + "_" + str(node_ids[node_type]))
node_ids[node_type] += 1
node_it += 1
# Set producers names
## Weights & Biases producers
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_parent(1).set_name(node.name() + "_weights")
if node.get_parent(2) is not None:
node.get_parent(2).set_name(node.name() + "_biases")
## Scaling Producers
for node in scheduler.get_sequential_static_scheduling():
"""
TODO: If multiple quantizer nodes are found, the producers will
all have the same name and this will not work properly.
"""
if node.type() == "Producer":
child_node = node.output(0)[0][0]
if node.attributes().has_attr("shift_prod"):
node.set_name(child_node.name() + "_shift")
if node.attributes().has_attr("coef_prod"):
node.set_name(child_node.name() + "_coef")
def set_nodes_datatypes(graph_view: aidge_core.GraphView):
""" Set the nodes' datatypes
The set_datatype function can't be used on Conv2D and FC nodes directly
as the biases datatype is different from the other inputs.
TODO: Should be using forward_datatype()
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
"""
for node in graph_view.get_nodes():
if node.type() != "Producer":
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
if node.get_operator().get_input(0) is not None:
node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input
node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights
if node.get_parent(2) is not None:
node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases
node.get_operator().get_output(0).set_datatype(aidge_core.dtype.int8) # Output
else:
node.get_operator().set_datatype(aidge_core.dtype.int8)
# Set input node's datatype
for n in graph_view.get_input_nodes():
if n.get_operator().get_input(0) is not None:
n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
def exclude_unwanted_producers(model):
""" Exclude some producers not needed for the export
Currently excludes the producers attached to the Mul, BitShift and Clip nodes, as they are
tensors holding a single value. This value is retrieved during the export
generation process and passed directly as an argument within the layer
configuration.
"""
nodes_to_ignore = ["Mul", "BitShift", "Clip"]
for node in model.get_nodes():
if node.type() == "Producer":
children_nodes = [n.type() for n in node.get_children()]
for node_type in nodes_to_ignore:
if node_type in children_nodes:
node.attributes().ignore = True
break
def set_scaling_attributes(export_node: aidge_core.export_utils.ExportNode, node: aidge_core.Node):
"""
Look recursively for a Quantizer node inside of the given node,
then set shift and coef attributes of the given export node.
[TODO] Should be moved into aidge_core.ExportNode
:param export_node: An instance of :py:class:`aidge_core.export_utils.ExportNode` to set the scaling
attributes needed for a quantized export.
:type export_node: aidge_core.export_utils.ExportNode
:param node: Node which may hold a Quantizer node.
:type node: aidge_core.Node
"""
QNode = get_node_from_metaop(node, "Quantizer")
if QNode:
BNode = get_node_from_metaop(QNode[0], "BitShift")
export_node.attributes["shift_value"] = BNode[0].get_operator().get_input(1)[0]
QMulNode = get_node_from_metaop(node, "QMul")
if QMulNode:
CNode = get_node_from_metaop(QMulNode[0], "Mul")
export_node.attributes["coef_value"] = CNode[0].get_operator().get_input(1)[0]
def normalize(array):
"""
Normalize an input image between -1 and 1
"""
if array.max() == array.min():
return array/array.max()
array = (array - array.min()) / (array.max() - array.min())
return 2 * array - 1
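A tiny worked example of the mapping above (the minimum maps to -1, the maximum to +1):

    import numpy as np
    img = np.array([0.0, 127.5, 255.0])
    # (img - 0) / 255 -> [0, 0.5, 1]; then 2 * x - 1 -> [-1, 0, 1]
    print(normalize(img))   # [-1.  0.  1.]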
def generate_aidge_ifmaps(model):
json_nodes = []
for node in model.get_nodes():
if node.type() != "Producer":
output = node.get_operator().get_output(0)
data = {
"name": node.name(),
"dims": output.dims(),
"dtype": aidge2c(output.dtype()),
"dformat": str(output.dformat()),
"values": np.array(output).tolist()
}
json_nodes.append(data)
# Write the entire list to the JSON file after the loop
with open('aidge_output.json', 'w') as file:
json.dump(json_nodes, file, indent=2, separators=(",", ": "))
def export_aidge_ifmaps(export_folder_name):
os.makedirs(export_folder_name / "data" / "aidge_outputs")
os.makedirs(export_folder_name / "data" / "export_outputs")
# Load the JSON data from the file
with open('aidge_output.json', 'r') as file:
json_nodes = json.load(file)
# Access the data
for node in json_nodes:
name = node["name"]
dims = node["dims"]
dtype = node["dtype"]
dformat = node["dformat"]
values = node["values"]
generate_file(export_folder_name / "data" / "aidge_outputs" / (name + ".hpp"),
ROOT / "templates" / "data" / "aidge_tensor.jinja",
dtype=dtype,
dformat=dformat,
name=name + "_output_0_aidge",
dims=dims,
values=values)
# Remove the JSON file
os.remove('aidge_output.json')
def propagate(model, scheduler, tensor):
"""
Propagate the given tensor into the model and return the
output tensor.
"""
# Run the inference
scheduler.forward(True, [tensor])
# Gather the results
output_node = model.get_ordered_outputs()[0][0]
output_tensor = output_node.get_operator().get_output(0).clone()
output_tensor.set_backend("cpu")
return np.array(output_tensor)
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#include <type_traits>
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "kernels/rescaling.hpp"
#include "network/activation_utils.hpp"
#include "network/rescaling_utils.hpp"
#include <sys/types.h>
template<typename Output_T, typename T,
typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
return value;
}
template<typename Output_T, typename T,
typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
if (std::is_unsigned<Output_T>::value) {
return clamp(value, T(0), (T(1) << sat) - 1);
} else {
return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
}
}
template<typename Output_T,
typename Sum_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
int output,
ActivationFunction_T func,
const Rescaling_T& __restrict rescaling)
{
switch(func) {
case Linear:
case Saturation: {
break;
}
case Rectifier: {
if(weightedSum <= 0) weightedSum = 0;
break;
}
default:
// Unsupported activation function
break;
}
// Value fixed here for now but it should be generated by
// the export module or determined by the type of Output_T
// For now only works for int8_t and uint8_t
const uint32_t NB_BITS = 8;
return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
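For reference, a short Python mirror of the integer saturate() path above with the fixed NB_BITS = 8, showing the resulting clamping ranges (illustrative only; the C++ templates are the actual implementation):

    def saturate_ref(value, sat=8, unsigned=False):
        # Mirrors the non-floating-point saturate() overload above.
        if unsigned:
            lo, hi = 0, (1 << sat) - 1                        # [0, 255]
        else:
            lo, hi = -(1 << (sat - 1)), (1 << (sat - 1)) - 1  # [-128, 127]
        return min(max(value, lo), hi)

    assert saturate_ref(300) == 127
    assert saturate_ref(-300) == -128
    assert saturate_ref(300, unsigned=True) == 255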
template<int NB_DATA,
template<size_t NB_ELTS,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void activation_forward (
......@@ -66,12 +26,22 @@ void activation_forward (
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling)
{
for (int i = 0; i < NB_DATA; ++i)
{
outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling);
}
int inOffset = 0;
int outOffset = 0;
}
for (size_t i = 0; i < NB_ELTS; ++i) {
if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
outputs[outOffset + i] = activation_forward_value<Output_T>(inputs[inOffset + i], i, ACTIVATION, rescaling);
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
......@@ -2,16 +2,18 @@
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
template<int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
template<size_t NB_BATCHES, size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
typename Param_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
......@@ -20,18 +22,22 @@ void batchnorm_forward (
const Param_T* __restrict variances,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const double epsilon)
const double epsilon,
const Rescaling_T& __restrict rescaling)
{
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
const Output_T var = sqrt(variances[output] + epsilon);
for (size_t batch = 0; batch < NB_BATCHES; ++batch) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// If the variance is 0, we need to avoid division by 0
Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const size_t outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset] = activation_forward_value<Output_T>(sAs, output, ACTIVATION, rescaling);
}
}
}
}
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#include <sys/types.h>
template<size_t AXIS_SIZE_POST,
size_t AXIS_SIZE_PRE,
const size_t AXIS_SIZE[],
size_t TOTAL_AXIS_SIZE,
size_t NB_INPUTS,
typename T>
__attribute__((always_inline)) inline static
void concat_forward (
const T* const * __restrict inputs,
T* __restrict output)
{
for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
// Loop over post-axis (e.g., dims after axis 1)
for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
size_t axis_offset = 0;
// Loop over each input tensor
for (size_t n = 0; n < NB_INPUTS; ++n) {
for (size_t k = 0; k < AXIS_SIZE[n]; ++k) {
const size_t input_idx = i * AXIS_SIZE[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
output[i * TOTAL_AXIS_SIZE * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
inputs[n][input_idx];
}
axis_offset += AXIS_SIZE[n]; // move along axis in output
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
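A NumPy mirror of the indexing used by concat_forward(), checked against np.concatenate (shapes below are illustrative; this is a reference model, not part of the export):

    import numpy as np

    def concat_ref(inputs, axis):
        dims = inputs[0].shape
        pre = int(np.prod(dims[:axis]))           # AXIS_SIZE_PRE
        post = int(np.prod(dims[axis + 1:]))      # AXIS_SIZE_POST
        sizes = [x.shape[axis] for x in inputs]   # AXIS_SIZE[]
        total = sum(sizes)                        # TOTAL_AXIS_SIZE
        out = np.empty((pre, total, post), dtype=inputs[0].dtype)
        offset = 0
        for n, x in enumerate(inputs):
            flat = x.reshape(pre, sizes[n], post)
            for k in range(sizes[n]):             # same loop structure as the kernel
                out[:, offset + k, :] = flat[:, k, :]
            offset += sizes[n]
        return out.reshape(dims[:axis] + (total,) + dims[axis + 1:])

    a, b = np.random.rand(2, 3, 4), np.random.rand(2, 5, 4)
    assert np.allclose(concat_ref([a, b], axis=1), np.concatenate([a, b], axis=1))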
......@@ -2,21 +2,33 @@
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
......@@ -28,60 +40,82 @@ void convolution_forward(
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
constexpr size_t OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
constexpr size_t OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const size_t syMin = (PADDING_Y == 0) ? 0
: max((PADDING_Y - (oy * STRIDE_Y) + DILATION_Y - 1) / DILATION_Y, 0);
const size_t syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
: clamp((CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y)) / DILATION_Y,
0, KERNEL_HEIGHT);
const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
#endif
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax = (PADDING_X == 0
const size_t sxMin = (PADDING_X == 0) ? 0
: max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0);
const size_t sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
? KERNEL_WIDTH
: clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X,
0, KERNEL_WIDTH);
const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
const int oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = NB_OUTPUTS * oPos;
const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
// <--
if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
Bias_T weightedSum = biases[output];
// <--
// Check if the biases are defined
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
&& sy >= syMax - syMin)
{
continue;
break;
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
int iOffset = NB_CHANNELS * iPos;
const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix)
+ CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y));
int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
if (INPUT_MEM_WRAP_SIZE > 0
&& iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + KERNEL_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = NB_CHANNELS * (sxMin
+ KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
&& DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
......@@ -90,16 +124,24 @@ void convolution_forward(
weightedSum);
}
else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
&& sx >= sxMax - sxMin)
{
continue;
break;
}
int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
+ sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur
......@@ -116,4 +158,69 @@ void convolution_forward(
}
}
// Template overloading when biases are not given to the convolution
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION,
// Memory mapping: inputs
INPUT_MEM_CONT_OFFSET,
INPUT_MEM_CONT_SIZE,
INPUT_MEM_WRAP_OFFSET,
INPUT_MEM_WRAP_SIZE,
INPUT_MEM_STRIDE,
// Memory mapping: outputs
OUTPUT_MEM_CONT_OFFSET,
OUTPUT_MEM_CONT_SIZE,
OUTPUT_MEM_WRAP_OFFSET,
OUTPUT_MEM_WRAP_SIZE,
OUTPUT_MEM_STRIDE>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#include "network/typedefs.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
"NB_OUTPUTS should be a multiple of NB_CHANNELS.");
constexpr size_t DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr size_t DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr size_t OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr size_t OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const size_t syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const size_t syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const size_t sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const size_t sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
// <--
const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS;
Bias_T weightedSum = biases ? biases[output] : 0;
for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
{
continue;
}
const size_t iPos = static_cast<size_t>(ix)
+ CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y));
int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (INPUT_MEM_WRAP_SIZE > 0
&& iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + KERNEL_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = (output*KERNEL_HEIGHT + sy)
* KERNEL_WIDTH;
if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
&& DILATION_X == 1 && ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
inputs + iOffset + channel,
weights + wOffset,
weightedSum);
}
else {
for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
{
continue;
}
int iOffsetInRange = iOffset
+ sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
weightedSum += inputs[iOffsetInRange + channel]
* weights[wOffset + sx];
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
}
}
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
int INPUT_MEM_CONT_OFFSET,
int INPUT_MEM_CONT_SIZE,
int INPUT_MEM_WRAP_OFFSET,
int INPUT_MEM_WRAP_SIZE,
int INPUT_MEM_STRIDE,
// Memory mapping: outputs
int OUTPUT_MEM_CONT_OFFSET,
int OUTPUT_MEM_CONT_SIZE,
int OUTPUT_MEM_WRAP_OFFSET,
int OUTPUT_MEM_WRAP_SIZE,
int OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_depthwise_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION,
// Memory mapping: inputs
INPUT_MEM_CONT_OFFSET,
INPUT_MEM_CONT_SIZE,
INPUT_MEM_WRAP_OFFSET,
INPUT_MEM_WRAP_SIZE,
INPUT_MEM_STRIDE,
// Memory mapping: outputs
OUTPUT_MEM_CONT_OFFSET,
OUTPUT_MEM_CONT_SIZE,
OUTPUT_MEM_WRAP_OFFSET,
OUTPUT_MEM_WRAP_SIZE,
OUTPUT_MEM_STRIDE>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
......@@ -2,170 +2,74 @@
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
// Generic function for two inputs
template<int NB_ELTS,
ElemWise_T ELEM_OP,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Rescaling_T>
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_MAT, ElemWise_T ELEM_OP,
size_t INPUT1_CONT_SIZE, size_t INPUT2_CONT_SIZE, size_t OUTPUT_CONT_SIZE,
const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT1_MEM_CONT_OFFSET,
size_t INPUT1_MEM_CONT_SIZE,
size_t INPUT1_MEM_WRAP_OFFSET,
size_t INPUT1_MEM_WRAP_SIZE,
size_t INPUT1_MEM_STRIDE,
size_t INPUT2_MEM_CONT_OFFSET,
size_t INPUT2_MEM_CONT_SIZE,
size_t INPUT2_MEM_WRAP_OFFSET,
size_t INPUT2_MEM_WRAP_SIZE,
size_t INPUT2_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void elemwise_forward (
void elemwise_forward(
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
if (std::is_floating_point<Input_T>::value)
{
Input_T val = 0;
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] + inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Sub: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] - inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Mul: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] * inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
}
}
else
{
int32_t val = 0;
static_assert(INPUT1_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
static_assert(INPUT2_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
static_assert(OUTPUT_MEM_CONT_SIZE % OUTPUT_CONT_SIZE == 0, "Incompatible output memory wrapping");
auto apply_op = [](auto a, auto b) -> Output_T {
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] + inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Sub: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] - inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Mul: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] * inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Add: return a + b;
case Sub: return a - b;
case Mul: return a * b;
case Div: return a / b;
default: return a;
}
}
}
};
// Generic function for multiple inputs
// Not working
// template<ElemWise_T ELEM_OP, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int /*pos*/, int /*ch*/)
// {
// return 0;
// }
// template<ElemWise_T ELEM_OP,
// int NB_CHANNELS,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int pos, int ch,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// int iOffset = NB_CHANNELS * pos;
// return firstInputs[iOffset + ch]
// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
// }
// template<// For all inputs
// int NB_CHANNELS,
// int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
// int NB_ELTS,
// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
// ElemWise_T ELEM_OP,
// ActivationFunction_T ACTIVATION,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T,
// typename Rescaling_T>
// __attribute__((always_inline)) inline
// void elemWise_forward (
// Output_T* __restrict outputs,
// const Rescaling_T& __restrict rescaling,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
// const int pos = (ox + OUTPUTS_WIDTH * oy);
// int oOffset = NB_ELTS * pos;
// for (int ch = 0; ch < NB_ELTS; ++ch) {
// const Add_T val = elemWise<ELEM_OP,
// INPUT_NB_CHANNELS,
// INPUT_MEM_CONT_OFFSET,
// INPUT_MEM_CONT_NB_ELTS,
// INPUT_MEM_WRAP_OFFSET,
// INPUT_MEM_WRAP_NB_ELTS,
// INPUT_MEM_STRIDE,
// ARGS...>(pos, ch, firstInputs, inputs...);
// outputs[oOffset + ch]
// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
// }
// }
// }
// }
for (size_t stack = 0; stack < NB_MAT; ++stack) {
const size_t offset_in1 = OFFSET_IN1[stack] * INPUT1_CONT_SIZE;
const size_t offset_in2 = OFFSET_IN2[stack] * INPUT2_CONT_SIZE;
int out_offset = stack * OUTPUT_CONT_SIZE;
if (OUTPUT_MEM_WRAP_SIZE > 0 && out_offset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
out_offset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
for (size_t i = 0; i < OUTPUT_CONT_SIZE; ++i) {
const size_t in0_id = (INPUT1_CONT_SIZE != 1) ? i : 0;
const size_t in1_id = (INPUT2_CONT_SIZE != 1) ? i : 0;
const size_t out_id = out_offset + i;
const auto val1 = inputs1[in0_id + offset_in1];
const auto val2 = inputs2[in1_id + offset_in2];
const Output_T val = apply_op(val1, val2);
outputs[out_id] = activation_forward_value<Output_T>(val, out_id, ACTIVATION, rescaling);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
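Conceptually, the OFFSET_IN1/OFFSET_IN2 tables generated by the export appear to replay NumPy-style broadcasting over the leading dimensions; ignoring the activation and rescaling, the kernel's Add/Sub/Mul/Div cases are roughly analogous to the following sketch (assumption, not taken from this diff):

    import numpy as np

    ops = {"Add": np.add, "Sub": np.subtract, "Mul": np.multiply, "Div": np.divide}

    a = np.random.rand(2, 1, 4)      # shapes chosen to exercise broadcasting
    b = np.random.rand(2, 3, 1)
    out = ops["Add"](a, b)           # the kernel then applies the activation per element
    assert out.shape == (2, 3, 4)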
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERP__
#define __AIDGE_EXPORT_CPP_KERNELS_ERP__
#include "network/typedefs.hpp"
#include "math.h"
#include <sys/types.h>
template<size_t NB_ELTS,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void erf_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
constexpr double a1 = 0.254829592;
constexpr double a2 = -0.284496736;
constexpr double a3 = 1.421413741;
constexpr double a4 = -1.453152027;
constexpr double a5 = 1.061405429;
constexpr double p = 0.3275911;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t i = 0; i < NB_ELTS; ++i) {
int sign = 1;
if (inputs[i] < 0)
sign = -1;
const double abs_value = abs(inputs[i]);
// A&S formula 7.1.26
const double t = 1.0/(1.0 + p*abs_value);
const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
outputs[i] = sign*y;
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ERP_
\ No newline at end of file
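The constants above are the coefficients of the Abramowitz & Stegun formula 7.1.26 approximation, whose absolute error is below about 1.5e-7. A Python mirror checked against math.erf:

    import math

    A1, A2, A3, A4, A5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    P = 0.3275911

    def erf_ref(x):
        # Same polynomial evaluation order as the kernel above.
        sign = -1.0 if x < 0 else 1.0
        ax = abs(x)
        t = 1.0 / (1.0 + P * ax)
        y = 1.0 - (((((A5 * t + A4) * t) + A3) * t + A2) * t + A1) * t * math.exp(-ax * ax)
        return sign * y

    assert all(abs(erf_ref(x) - math.erf(x)) < 2e-7
               for x in (-3.0, -0.5, 0.0, 0.7, 2.5))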
......@@ -2,16 +2,33 @@
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
/**
* @brief Kernel to use when the input is in the NHWC format, and the
* weights have been transposed accordingly.
*/
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
......@@ -23,50 +40,209 @@ void fullyconnected_forward (
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
// Warning, there is a trick here !
// To use this kernel, the inputs have to be in NHWC and the weights are in NCHW
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
// constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS;
constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH;
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t och = 0; och < NB_OUTPUTS; ++och) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
int iOffset = INPUT_HEIGHT_STRIDE * iy;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + CHANNELS_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och;
if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) {
macsOnRange<INPUT_HEIGHT_STRIDE>(
inputs + iOffset,
weights + wOffset,
weightedSum);
}
else {
for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
macsOnRange<INPUT_WIDTH_STRIDE>(
inputs + iOffsetInRange,
weights + wOffset + ix * WEIGHT_WIDTH_STRIDE,
weightedSum);
}
}
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
/*
Here the kernel to use with inputs in NHWC and weights in NHWC
}
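// ---------------------------------------------------------------------------
// Illustrative reference (not part of the export): what the NHWC kernel above
// computes once the memory-wrapping machinery is stripped away and the input
// stride equals NB_CHANNELS. Inputs are laid out as [H][W][C] and the weights
// as [NB_OUT][H][W][C] (i.e. transposed to match the input layout), so the
// innermost loop runs over a contiguous range of channels. Names are
// hypothetical and activation/rescaling is omitted.
// ---------------------------------------------------------------------------
template<size_t C, size_t H, size_t W, size_t NB_OUT>
inline void fc_nhwc_reference_example(const float* inputs,   // [H][W][C]
                                      const float* weights,  // [NB_OUT][H][W][C]
                                      const float* biases,   // [NB_OUT] or nullptr
                                      float* outputs)        // [NB_OUT]
{
    for (size_t och = 0; och < NB_OUT; ++och) {
        float acc = (biases) ? biases[och] : 0.0f;

        for (size_t iy = 0; iy < H; ++iy)
            for (size_t ix = 0; ix < W; ++ix)
                for (size_t ch = 0; ch < C; ++ch)
                    acc += inputs[(iy * W + ix) * C + ch]
                         * weights[((och * H + iy) * W + ix) * C + ch];

        outputs[och] = acc;  // the real kernel applies ACTIVATION and rescaling here
    }
}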
/**
 * @brief Kernel to use when the input is in the NCHW or Default
 * format (4D or 2D).
 */
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void fullyconnected_default_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och;
macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>(
inputs,
weights + wOffset,
weightedSum);
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
}
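// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the export): the default kernel above is a
// single dot product per output channel, delegated to macsOnRange from
// "network/macs.hpp". Assuming that helper simply accumulates N products (the
// real implementation may be unrolled or vectorised), a minimal scalar
// equivalent would be:
// ---------------------------------------------------------------------------
template<size_t N, typename Input_T, typename Weight_T, typename Sum_T>
inline void macs_on_range_example(const Input_T*  __restrict inputs,
                                  const Weight_T* __restrict weights,
                                  Sum_T& weightedSum)
{
    for (size_t i = 0; i < N; ++i) {
        weightedSum += inputs[i] * weights[i];
    }
}
// With such a helper, fullyconnected_default_forward reduces, per output
// channel, to outputs[och] = act(bias[och] + dot(inputs, weights + och*C*H*W)).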
/**
* @brief Kernel to use when the input is in the NHWC format, but the
* weights have not been transposed and still follow the NCHW format order.
*/
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void fullyconnected_transpose_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
// constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH;
constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH;
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH;
#ifdef _OPENMP
#pragma omp parallel for
for (int och = 0; och < NB_OUTPUTS; och++) {
#endif
for (size_t och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
Bias_T weightedSum = biases[och];
for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
int iOffset = INPUT_HEIGHT_STRIDE * iy;
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy);
int iOffset = NB_CHANNELS * iPos;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
* (iy + CHANNELS_HEIGHT * och);
if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + CHANNELS_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset,
weights + wOffset,
weightedSum);
const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy;
for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
// Beware that the pointer increment for weights is
// CHANNELS_HEIGHT*CHANNELS_WIDTH
macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>(
inputs + iOffsetInRange,
weights + wOffset + ix,
weightedSum);
}
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
*/
}
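// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the export): the transpose kernel above
// pairs a contiguous NHWC channel run in the input with NCHW-ordered weights,
// so the weight pointer must advance by CHANNELS_HEIGHT*CHANNELS_WIDTH between
// consecutive channels. Assuming the two-parameter macsOnRange overload in
// "network/macs.hpp" performs exactly that strided accumulation, a scalar
// equivalent would be:
// ---------------------------------------------------------------------------
template<size_t N, size_t WEIGHT_STRIDE,
         typename Input_T, typename Weight_T, typename Sum_T>
inline void strided_macs_example(const Input_T*  __restrict inputs,   // contiguous channels (NHWC)
                                 const Weight_T* __restrict weights,  // channel-major (NCHW)
                                 Sum_T& weightedSum)
{
    for (size_t ch = 0; ch < N; ++ch) {
        weightedSum += inputs[ch] * weights[ch * WEIGHT_STRIDE];
    }
}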
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#define __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include <sys/types.h>
template <size_t AXIS_DIM_SIZE,
size_t PREAXIS_STRIDE,
size_t AXIS_STRIDE,
size_t POSTAXIS_STRIDE,
size_t NB_ELTS,
typename Input_T,
typename Output_T>
// void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
// std::vector<DimSize_t>& dims, const void* input_, void* output_)
__attribute__((always_inline)) inline void
hardmax2d_forward(const Input_T *__restrict input,
Output_T *__restrict output) {
// fill output with 0
for (Output_T *i = output; i != output + NB_ELTS; ++i) {
*i = 0;
}
// For each index on all the axes before and after 'axis', we have a
// different max element to find
for (size_t i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
for (size_t j = 0; j < POSTAXIS_STRIDE; ++j) {
// Init the max with first element
size_t maxIdx = 0;
Input_T maxVal = input[preAxisOffset + j];
            // Loop over the elements along 'axis'.
            // The max was initialised with element 0, so the loop starts
            // at k = 1, as if the first iteration had already been done.
for (size_t k = 1,
postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
k < AXIS_DIM_SIZE;
++k, postAxisOffset += POSTAXIS_STRIDE) {
Input_T currVal = input[postAxisOffset + j];
// Update max elements
if (currVal > maxVal) {
maxIdx = k;
maxVal = currVal;
}
}
output[preAxisOffset + maxIdx * POSTAXIS_STRIDE + j] = 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
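// ---------------------------------------------------------------------------
// Illustrative usage (not part of the export): for a row-major input of shape
// (2, 3, 4) with the hardmax taken along axis 1, the template parameters of
// the kernel above decompose as
//   PREAXIS_STRIDE  = 2   (product of the dims before 'axis')
//   AXIS_DIM_SIZE   = 3   (size of 'axis')
//   POSTAXIS_STRIDE = 4   (product of the dims after 'axis')
//   NB_ELTS         = 24  (total number of elements)
// AXIS_STRIDE is assumed to be the row-major stride of 'axis' (here 4); it is
// not used by the loop shown above. The function below is only an example.
// ---------------------------------------------------------------------------
inline void hardmax_usage_example() {
    static float in[24]  = {};  // shape (2, 3, 4); fill with real data in practice
    static float out[24] = {};  // gets a single 1 per (pre-axis, post-axis) pair

    // <AXIS_DIM_SIZE, PREAXIS_STRIDE, AXIS_STRIDE, POSTAXIS_STRIDE, NB_ELTS>
    hardmax2d_forward<3, 2, 4, 4, 24>(in, out);
}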
#ifndef __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
#define __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
#include "network/typedefs.hpp"
#include <sys/types.h>
// Generic identity (copy) function
template<size_t M,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void identity_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
    // If the input and output pointers are the same, the memory manager has
    // already optimized this function away, so it is a no-op!
    if (inputs == outputs)
        return;
    // An identity is essentially a no-op in C++:
    // we only need to copy the input buffer to the output.
for (size_t m = 0; m < M; ++m) {
outputs[m] = inputs[m];
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
\ No newline at end of file
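// ---------------------------------------------------------------------------
// Illustrative usage (not part of the export): when the memory planner maps an
// Identity node onto the same buffer for input and output, the early return
// above makes the call free; with distinct buffers it degenerates to a plain
// element-wise copy. The names below are hypothetical.
// ---------------------------------------------------------------------------
inline void identity_usage_example() {
    static float buf[8]   = {0, 1, 2, 3, 4, 5, 6, 7};
    static float other[8] = {};

    identity_forward<8>(buf, buf);    // in-place: returns immediately, nothing copied
    identity_forward<8>(buf, other);  // distinct buffers: copies the 8 elements
}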
......@@ -2,24 +2,50 @@
#define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
#include "network/typedefs.hpp"
#include <sys/types.h>
#include <sys/types.h>
template<int NB_DATA,
template<size_t NB_ELTS,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void leakyrelu_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const float negative_slope)
{
#pragma omp parallel for
for (int i = 0; i < NB_DATA; ++i) {
if (inputs[i] >= 0) {
outputs[i] = inputs[i];
} else {
outputs[i] = negative_slope * inputs[i];
int inOffset = 0;
int outOffset = 0;
for (size_t i = 0; i < NB_ELTS; ++i) {
if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
if (inputs[inOffset + i] >= 0) {
outputs[outOffset + i] = inputs[inOffset + i];
}
else {
outputs[outOffset + i] = negative_slope * inputs[inOffset + i];
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
\ No newline at end of file
#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
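// ---------------------------------------------------------------------------
// Illustrative walkthrough (not part of the export): the *_MEM_* template
// parameters describe a buffer that the memory planner may have split in two:
// a contiguous part of CONT_SIZE bytes at CONT_OFFSET followed, elsewhere, by
// a wrapped part of WRAP_SIZE bytes at WRAP_OFFSET. The sketch below applies
// the same offset formula as leakyrelu_forward (which computes it once, when
// the index crosses the boundary, and keeps it for the rest of the loop) for
// a hypothetical float buffer with CONT_OFFSET = 0, CONT_SIZE = 64 bytes
// (16 floats) and WRAP_OFFSET = 256 bytes.
// ---------------------------------------------------------------------------
inline int wrapped_index_example(size_t i) {
    constexpr size_t CONT_OFFSET = 0;    // bytes
    constexpr size_t CONT_SIZE   = 64;   // bytes -> 16 floats fit contiguously
    constexpr size_t WRAP_OFFSET = 256;  // bytes

    int offset = 0;
    if (i >= CONT_SIZE / sizeof(float)) {
        // Jump from the end of the contiguous block to the start of the
        // wrapped block, expressed in elements.
        offset = static_cast<int>((WRAP_OFFSET - CONT_OFFSET - CONT_SIZE) / sizeof(float));
    }
    // Element 16 lands at index 64, i.e. byte offset 256 == WRAP_OFFSET.
    return offset + static_cast<int>(i);
}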
......@@ -2,16 +2,15 @@
#define __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
// Generic function for matmul and activation
template<int M,
int K,
int N,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Rescaling_T>
template<size_t NB_MAT, size_t N, size_t M, size_t K,
const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void matmul_forward (
const Input_T* __restrict inputs1,
......@@ -19,15 +18,28 @@ void matmul_forward (
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling)
{
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
Output_T sum = Output_T(0);
for (int k = 0; k < K; ++k) {
sum += inputs1[K*m + k] * inputs2[N*k + n];
for (size_t stack = 0; stack < NB_MAT; ++stack) {
const size_t offset1 = OFFSET_IN1[stack] * N * K;
const size_t offset2 = OFFSET_IN2[stack] * K * M;
Output_T* out_ptr = &outputs[stack * N * M];
for (size_t i = 0; i < N; ++i) {
            const Input_T* in1_row = &inputs1[offset1 + i * K];
            for (size_t j = 0; j < M; ++j) {
                Output_T sum = 0;
                // Walk down column j of inputs2, which is stored row-major
                for (size_t l = 0; l < K; ++l) {
                    sum += in1_row[l] * inputs2[offset2 + l * M + j];
}
out_ptr[i * M + j] = activation_forward_value<Output_T>(
sum, 0 /* not applicable */, ACTIVATION, rescaling
);
}
outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
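// ---------------------------------------------------------------------------
// Illustrative reference (not part of the export): the OFFSET_IN1 / OFFSET_IN2
// arrays of the kernel above encode batch broadcasting, e.g. A of shape
// (1, N, K) multiplied with B of shape (3, K, M) uses offA = {0, 0, 0} and
// offB = {0, 1, 2}, so the single A matrix is reused for every stack. The
// sketch below reproduces that indexing without the activation/rescaling step;
// all names are hypothetical.
// ---------------------------------------------------------------------------
template<size_t NB_MAT, size_t N, size_t M, size_t K>
inline void batched_matmul_reference_example(const float* A, const float* B, float* C,
                                             const size_t (&offA)[NB_MAT],
                                             const size_t (&offB)[NB_MAT])
{
    for (size_t s = 0; s < NB_MAT; ++s)
        for (size_t i = 0; i < N; ++i)
            for (size_t j = 0; j < M; ++j) {
                float sum = 0.0f;
                for (size_t l = 0; l < K; ++l)
                    sum += A[offA[s] * N * K + i * K + l]
                         * B[offB[s] * K * M + l * M + j];
                C[s * N * M + i * M + j] = sum;
            }
}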
\ No newline at end of file