
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (239)
Showing with 942 additions and 222 deletions
......@@ -11,6 +11,13 @@ __pycache__
*.pyc
*.egg-info
dist*/
aidge_export_cpp/_version.py
wheelhouse/*
env_aidge/
# Temp test folders
aidge_export_cpp/unit_tests/*_temp_test
*_test/
# Mermaid
*.mmd
......@@ -19,4 +26,7 @@ dist*/
xml*/
# ONNX
*.onnx
\ No newline at end of file
*.onnx
# GDB
.gdb_history
\ No newline at end of file
###############################################################################
# Aidge Continuous Integration and Deployment #
# #
###############################################################################
stages:
- static_analysis
- build
- test
- coverage
- release
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/pip.gitlab-ci.yml'
# Since aidge_export_cpp is a pure Python package, building on Windows and on Ubuntu doesn't differ
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
before_script:
- !reference [.setup:test:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
# Need to install extra dependencies for tests:
- python -m pip install torch torchvision
coverage:ubuntu_python:
before_script:
- !reference [.setup:coverage:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
\ No newline at end of file
#!/bin/bash
set -e
if [[ "$1" == "" ]]; then
echo "build aidge deps in cibuildwheel container before building wheel."
echo "search path defines where the dependencies will be searched."
echo "Hint : In wheel containers, files are mounted on /host by default."
echo "\nusage : ./cibuildwheel_build_deps_before_build_wheel.sh $search_path"
fi
set -x
if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_core
mkdir -p build # create the build dir if it's not already there to hold the build of cpp files
rm -rf build/* # build from scratch
else
for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
search_path=$1
REPO_PATH=$(find $search_path ! -writable -prune -o -type d \
-name "$repo" \
-not -path "*/install/*" \
-not -path "*/.git/*" \
-not -path "*/miniconda/*" \
-not -path "*/conda/*" \
-not -path "*/.local/*" \
-not -path "*/lib/*" \
-not -path "*/$repo/$repo/*" \
-not -path "*/proc/*" \
-print -quit)
if [[ -z "$REPO_PATH" ]]; then
echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
exit 1
fi
cd $REPO_PATH
mkdir -p build # create the build dir if it's not already there to hold the build of cpp files
rm -rf build/* # build from scratch
pip install . -v
cd -
done
fi
set +x
set +e
# Version 0.2.1 (January 31, 2025)
# Version 0.2.0 (December 6, 2024)
# Version 0.0.1 (January 23, 2024)
Initial release
include MANIFEST.in
include LICENSE
include README.md
recursive-include aidge_export_cpp *
include setup.py
include version.txt
# Aidge CPP Export
Use this module to export your Aidge model to a generic CPP export
\ No newline at end of file
Use this module to export your Aidge model to a generic CPP export
## Install
Install with:
pip install -v .
## Development mode install
For editable/development mode, install with:
pip install -v --no-build-isolation -e .
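A minimal end-to-end usage sketch (assuming `aidge_onnx` and `aidge_backend_cpu` are installed; the model file name and input shape are placeholders):

import aidge_core
import aidge_onnx
import aidge_export_cpp

# Load a model (placeholder file name) and schedule it on the CPU backend
model = aidge_onnx.load_onnx("model.onnx")
model.set_backend("cpu")
model.forward_dims([[1, 3, 224, 224]])  # illustrative input dimensions

scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()

# Generate the standalone C++ project in ./export_cpp/
aidge_export_cpp.export("export_cpp", model, scheduler)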
......@@ -2,14 +2,15 @@ r"""
Aidge Export for CPP standalone projects
"""
from pathlib import Path
from .operators import *
from collections import defaultdict
import aidge_core
from aidge_export_cpp.utils import ROOT
__version__ = open(ROOT / "version.txt", "r").read().strip()
# Constants
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]
from .export_registry import ExportLibCpp
from .export_utils import *
from .operators import *
from .export import *
from . import benchmark
import contextlib
import os
from shutil import rmtree
from subprocess import run
import numpy as np
import aidge_core
import aidge_backend_cpu
import aidge_export_cpp
def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# load and set up the model
# model.set_datatype(ai.dtype.float32)
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
# for ordered_input in ordered_inputs:
# ordered_input.set_backend("cpu")
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" ├─Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
print(" ok")
print(" ├─Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
return timings
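A usage sketch for the helper above, assuming this file is the `benchmark` module imported in `__init__.py`; the model, input name and input shape are illustrative:

import numpy as np
import aidge_export_cpp

# inputs are given as (name, array) pairs; `model` is an aidge_core.GraphView
data = [("input_0", np.random.rand(1, 3, 32, 32).astype(np.float32))]
timings = aidge_export_cpp.benchmark.measure_inference_time(model, data, nb_warmup=5, nb_iterations=20)
print("mean timing:", np.mean(timings))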
def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
# load and set up the model
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" │ Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
print(" ok")
print(" │ Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
outputs_str: list[str] = output_str.stdout.strip().split('\n')
outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
for i, pair in enumerate(model.get_ordered_outputs()):
dims = pair[0].get_operator().get_output(pair[1]).dims()
nb_dims = len(dims)
dims_permutted = dims
if nb_dims == 3:
dims_permutted = [dims[0], dims[2], dims[1]]
if nb_dims == 4:
dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
if np.prod(dims) != outputs[i].size:
aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
outputs[i] = outputs[i].reshape(dims_permutted)
if nb_dims == 3:
outputs[i] = outputs[i].transpose(0,2,1)
if nb_dims == 4:
outputs[i] = outputs[i].transpose(0,3,1,2)
return outputs
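A matching sketch for `compute_output` (same assumptions as above); note that the outputs are transposed back to the original NCHW layout before being returned:

outputs = aidge_export_cpp.benchmark.compute_output(model, data)
# The export outputs can therefore be compared directly with a
# backend_cpu reference array, e.g.:
# np.testing.assert_allclose(outputs[0], reference, rtol=1e-4)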
import re
import os
from pathlib import Path
import shutil
import numpy as np
from pathlib import Path
from typing import List, Union
from jinja2 import Environment, FileSystemLoader
import aidge_core
from aidge_core.export.code_generation import *
from aidge_export_cpp.utils import (ROOT, OPERATORS_REGISTRY, supported_operators)
from aidge_export_cpp.utils.converter import aidge_datatype2ctype, numpy_dtype2ctype
import aidge_export_cpp.operators
from aidge_export_cpp.utils.generation import *
from aidge_export_cpp.memory import *
def generate_input_file(export_folder:str,
array_name:str,
array: np.ndarray):
# If directory doesn't exist, create it
if not os.path.exists(export_folder):
os.makedirs(export_folder)
generate_file(
file_path=f"{export_folder}/{array_name}.h",
template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
dims = array.shape,
data_t = numpy_dtype2ctype(array.dtype),
name = array_name,
values = array.tolist()
)
def export(export_folder_name, graphview, scheduler):
export_folder = Path().absolute() / export_folder_name
os.makedirs(str(export_folder), exist_ok=True)
dnn_folder = export_folder / "dnn"
os.makedirs(str(dnn_folder), exist_ok=True)
list_actions = []
list_configs = []
from aidge_core.mem_info import generate_optimized_memory_info
from aidge_core.export_utils import scheduler_export, generate_main_cpp, aidge2c, generate_file
list_forward_nodes = scheduler.get_static_scheduling()
from aidge_export_cpp import ExportLibCpp, ROOT
from aidge_export_cpp.export_utils import read_log_file
for node in list_forward_nodes:
if node.type() in supported_operators():
op = OPERATORS_REGISTRY[node.type()](node)
# For configuration files
list_configs = op.export(dnn_folder, list_configs)
# For forward file
list_actions = op.forward(list_actions)
# Memory management
mem_size, mem_info = compute_default_mem_info(scheduler)
# Generate the memory file
generate_file(
str(dnn_folder / "memory" / "mem_info.h"),
str(ROOT / "templates" / "memory" / "mem_info.jinja"),
mem_size = mem_size,
mem_info_legends = MEMORY_INFO_TEMPLATE,
mem_info = mem_info
)
list_configs.append("memory/mem_info.h")
# Get entry nodes
# It assumes the entry nodes are Producers with constant=false
# Store the datatype & name
list_inputs_name = []
for node in graphview.get_nodes():
if node.type() == "Producer":
if not node.get_operator().get_attr("Constant"):
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype())
list_inputs_name.append((export_type, node.name()))
# Get output nodes
# Store the datatype & name, like entry nodes
list_outputs_name = []
for node in graphview.get_nodes():
if len(node.get_children()) == 0:
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype())
list_outputs_name.append((export_type, node.name()))
# Generate forward file
generate_file(
str(dnn_folder / "src" / "forward.cpp"),
str(ROOT / "templates" / "network" / "network_forward.jinja"),
headers=list_configs,
actions=list_actions,
inputs= list_inputs_name,
outputs=list_outputs_name
)
# Generate dnn API
generate_file(
str(dnn_folder / "include" / "dnn.hpp"),
str(ROOT / "templates" / "network" / "dnn_header.jinja"),
libraries=[],
functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
)
# Copy all static files in the export
shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
\ No newline at end of file
def export(export_folder_name: str,
graphview: aidge_core.GraphView,
scheduler: Union[List[aidge_core.Node],
aidge_core.Scheduler],
inputs_tensor: aidge_core.Tensor = None,
labels: aidge_core.Tensor = None,
dev_mode: bool = False,
aidge_cmp: bool = False):
""" Export an aidge_core.Scheduler to C++ code
:param export_folder_name: Export folder name
:type export_folder_name: str
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
:type graph_view: aidge_core.GraphView
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
:param inputs_tensor: **For future** argument to provide tensor to use in the main function, not implemented yet!
By default, the input of the given graph will be exported.
:type input_tensor: aidge_core.Tensor
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
:param labels: Argument to provide labels tensor to generate and use in the main function.
:type labels: aidge_core.Tensor
:param dev_mode: Wether or not the developer mode is enabled. If enabled, the export files
will be symlinks from the aidge_export_cpp module. Therefore, modifying
a file within the export will change the module as well.
:type dev_mode: boolean
"""
export_folder_name = Path(export_folder_name)
# Remove existing export
if os.path.isdir(export_folder_name):
print("Removing existing export directory...")
shutil.rmtree(export_folder_name)
# Generate Model Files
"""
Perform the following tasks :
- Generate the parameters and layers config files
- Generate the forward.cpp file
- Copy all needed kernels
"""
scheduler_export(scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=generate_optimized_memory_info,
memory_manager_args={
"stats_folder": f"{export_folder_name}/stats"},
dev_mode=dev_mode)
# Generate main file
generate_main_cpp(export_folder_name, graphview, labels=labels, inputs_tensor=inputs_tensor)
# Generate log files (aidge_cmp option)
"""
If the aidge_cmp option has been enabled, the generated log_outputs will
be copied into the generated export in order to be used as reference.
"""
if aidge_cmp:
ranked_nodes = graphview.get_ranked_nodes_name("{0}[{1}#{3}]")
os.makedirs(export_folder_name / "data" / "aidge_outputs")
os.makedirs(export_folder_name / "data" / "export_outputs")
for node in graphview.get_nodes():
if node.type() != "Producer":
file_path = 'log_outputs/' + ranked_nodes[node] + '/output_0.log'
data_t = aidge2c(node.get_operator().get_output(0).dtype())
name = node.name() + '_output_0_aidge'
dims = node.get_operator().get_output(0).dims()
values = read_log_file(file_path)
generate_file(export_folder_name / "data" / "aidge_outputs" / (node.name() + ".hpp"),
ROOT / "templates" / "data" / "aidge_tensor.jinja",
data_t=data_t,
name=name,
dims=dims,
values=values)
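A hedged example of calling this function with its optional flags (the folder name is illustrative; aidge_cmp=True assumes the Aidge log_outputs/ reference files were generated beforehand):

export("resnet_export",      # export folder, removed and recreated if it already exists
       graphview=model,
       scheduler=scheduler,
       dev_mode=True,        # symlink export files back to the module sources
       aidge_cmp=False)      # set True only if log_outputs/ is available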
from aidge_core.export_utils import ExportLib
from aidge_export_cpp import ROOT
class ExportLibCpp(ExportLib):
_name="export_cpp"
static_files={
str(ROOT / "static" / "Makefile"): "",
str(ROOT / "static" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "rescaling_utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "activation_utils.hpp"): "dnn/include/network",
}
import os
from collections import OrderedDict
import aidge_core
from aidge_core.export_utils import get_node_from_metaop
def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
"""
Fuse nodes into metaops adapted for the CPP Export
TODO: These recipes should be moved into aidge_core
:param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
ordered input/output data within the computational graph.
"""
cpp_recipes = OrderedDict({
# Quantization
"QMul": "Mul->Quantizer", # Fixed Point Scaling
# FC
"QFC": "FC->(Quantizer|QMul)",
"FCAct": "(FC|QFC)->ReLU",
# Conv
"QConv": "Conv2D->(Quantizer|QMul)",
"PadConv": "(QConv|Conv2D)<-Pad2D",
"ConvAct": "(QConv|Conv2D)->ReLU",
"PadConvAct": "PadConv->ReLU",
# ConvDw
"QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
"ConvDwPad": "(QConvDw|ConvDepthWise2D)->Pad2D",
"ConvDwAct": "(QConvDw|ConvConvDepthWise2D2D)->ReLU",
"ConvDwActPad": "ConvDwAct->Pad2D",
# Max Pooling
"PadMaxPool": "MaxPooling2D<-Pad2D",
"MaxPoolAct": "MaxPooling2D->ReLU",
"PadMaxPoolAct": "PadMaxPool->ReLU",
# Average Pooling
"PadAvgPool": "AvgPooling2D<-Pad2D",
"AvgPoolAct": "AvgPooling2D->ReLU",
"PadAvgPoolAct": "PadAvgPool->ReLU",
# Global Average Pooling
"PadGlobalAvgPool": "GlobalAveragePooling2D<-Pad2D",
"GlobalAvgPoolAct": "GlobalAveragePooling2D->ReLU",
"PadGlobalAvgPoolAct": "PadGlobalAveragePool->ReLU",
# ElemWise
"QAdd": "Add->(Quantizer|QMul)",
"QSub": "Sub->(Quantizer|QMul)",
# "QMul": "Mul->Quantizer", # Already defined
"AddAct": "(QAdd|Add)->ReLU",
"SubAct": "(QSub|Sub)->ReLU",
"MulAct": "(QMul|Mul)->ReLU",
# Activation
"QReLU": "ReLU->(Quantizer|QMul)",
})
for node, recipe in cpp_recipes.items():
aidge_core.fuse_to_metaops(graph_view, recipe, node)
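For illustration, a short sketch of the effect of these recipes (`model` stands for a quantized aidge_core.GraphView):

cpp_fuse_to_metaops(model)

# After fusion, e.g. Conv2D->Quantizer becomes a "QConv" metaop and
# (QConv|Conv2D)->ReLU becomes "ConvAct"; the remaining node types should
# mostly be the metaop names listed above.
for node in model.get_nodes():
    print(node.type())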
def set_nodes_names(scheduler):
"""
Set the CPP nodes' names as well as their producers'.
The producers' names are derived from their child node.
[TODO] FC and Conv layers will always have weights as parent 1 and
possibly biases as parent 2. It may be better to label the producers
beforehand.
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
"""
node_ids = {} # Dict holding the node type along with a counter
node_it = 0 # Node Iterator
## MetaOps
for node in scheduler.get_sequential_static_scheduling():
node_type = node.type()
if node_type != "Producer":
if node.type() not in node_ids:
node_ids[node_type] = 0
# Set node name
node.set_name("_" + str(node_it) + "_" +
str(node_type) + "_" + str(node_ids[node_type]))
node_ids[node_type] += 1
node_it += 1
# Set producers names
## Weights & Biases producers
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_parent(1).set_name(node.name() + "_weights")
if node.get_parent(2) is not None:
node.get_parent(2).set_name(node.name() + "_biases")
## Scaling Producers
for node in scheduler.get_sequential_static_scheduling():
"""
TODO: If multiple quantizer nodes are found, the producers will
all have the same name and this will not work properly.
"""
if node.type() == "Producer":
child_node = node.output(0)[0][0]
if node.attributes().has_attr("shift_prod"):
node.set_name(child_node.name() + "_shift")
if node.attributes().has_attr("coef_prod"):
node.set_name(child_node.name() + "_coef")
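A short illustration of the resulting naming scheme (indices depend on the actual schedule, so the names below are only indicative):

set_nodes_names(scheduler)

# The third non-Producer node of the schedule, if it is the first "PadConvAct"
# encountered, is named "_2_PadConvAct_0"; its parameter producers become
# "_2_PadConvAct_0_weights" and "_2_PadConvAct_0_biases", and a quantizer
# shift producer attached to it becomes "_2_PadConvAct_0_shift".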
def set_nodes_datatypes(graph_view: aidge_core.GraphView):
""" Set the nodes' datatypes
The set_datatype function can't be used on Conv2D and FC nodes directly
as the biases datatype is different from the other inputs.
TODO: Should be using forward_datatype()
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
"""
for node in graph_view.get_nodes():
if node.type() != "Producer":
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input
node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights
if node.get_parent(2) is not None:
node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases
node.get_operator().get_output(0).set_datatype(aidge_core.dtype.int8) # Output
else:
node.get_operator().set_datatype(aidge_core.dtype.int8)
# Set input node's datatype
for n in graph_view.get_input_nodes():
n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
def read_log_file(file_path: str):
""" Read log file
Used to read the aidge generated log files containing the intermediate
tensors of the exported model.
:param file_path: Path to the file to read.
:type file_path: str
"""
# Check if the file exists
if not os.path.isfile(file_path):
print(f"File not found: {file_path}")
return None
with open(file_path, 'r') as file:
content = file.read()
return content
def exclude_unwanted_producers(model):
""" Exclude some producers not needed for the export
Currently excludes the producers attached to Mul, BitShift and Clip nodes, as they are
tensors holding a single value. This value is retrieved during the export
generation process and passed directly as an argument within the layer
configuration.
"""
nodes_to_ignore = ["Mul", "BitShift", "Clip"]
for node in model.get_nodes():
if node.type() == "Producer":
children_nodes = [n.type() for n in node.get_children()]
for node_type in nodes_to_ignore:
if node_type in children_nodes:
node.attributes().ignore = True
break
def set_scaling_attributes(export_node: aidge_core.export_utils.ExportNode, node: aidge_core.Node):
"""
Look recursively for a Quantizer node inside of the given node,
then set shift and coef attributes of the given export node.
[TODO] Should be moved into aidge_core.ExportNode
:param export_node: An instance of :py:class:`aidge_core.export_utils.ExportNode` to set the scaling
attributes needed for a quantized export.
:type export_node: aidge_core.export_utils.ExportNode
:param node: Node which may hold a Quantizer node.
:type node: aidge_core.Node
"""
QNode = get_node_from_metaop(node, "Quantizer")
if QNode:
BNode = get_node_from_metaop(QNode[0], "BitShift")
export_node.attributes["shift_value"] = BNode[0].get_operator().get_input(1)[0]
QMulNode = get_node_from_metaop(node, "QMul")
if QMulNode:
CNode = get_node_from_metaop(QMulNode[0], "Mul")
export_node.attributes["coef_value"] = CNode[0].get_operator().get_input(1)[0]
def normalize(array):
"""
Normalize an input image between -1 and 1
"""
if array.max() == array.min():
return array/array.max()
array = (array - array.min()) / (array.max() - array.min())
return 2 * array - 1
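A quick numeric example of this helper on an 8-bit image range:

import numpy as np

img = np.array([0.0, 127.0, 255.0])
print(normalize(img))  # approximately [-1.0, -0.004, 1.0]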
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#include <type_traits>
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "network/rescaling.hpp"
template<typename Output_T, typename T,
typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
return value;
}
template<typename Output_T, typename T,
typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
if (std::is_unsigned<Output_T>::value) {
return clamp(value, T(0), (T(1) << sat) - 1);
} else {
return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
}
}
template<typename Output_T,
typename Sum_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
int output,
ActivationFunction_T func,
const Rescaling_T& __restrict rescaling)
{
switch(func) {
case Linear:
case Saturation: {
break;
}
case Rectifier: {
if(weightedSum <= 0) weightedSum = 0;
break;
}
default:
// Unsupported activation function
break;
}
// Value fixed here for now but it should be generated by
// the export module or determined by the type of Output_T
// For now only works for int8_t and uint8_t
const uint32_t NB_BITS = 8;
return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
#include "network/activation_utils.hpp"
#include "network/rescaling_utils.hpp"
template<int NB_DATA,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void activation_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......
......@@ -2,17 +2,19 @@
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/activation_utils.hpp"
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
template<int NB_OUTPUTS,
template<int NB_BATCHES, int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
__attribute__((always_inline)) inline
typename Input_T, typename Output_T,
typename Param_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -20,18 +22,22 @@ void batchnorm_forward (
const Param_T* __restrict variances,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const double epsilon)
const double epsilon,
const Rescaling_T& __restrict rescaling)
{
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
const Output_T var = sqrt(variances[output] + epsilon);
for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
// If the variance is 0, we need to avoid division by 0
Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset] = activation_forward_value<Output_T>(sAs, output, ACTIVATION, rescaling);
}
}
}
}
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
template<int AXIS_SIZE_POST,
int AXIS_SIZE_PRE,
unsigned int NB_INPUTS,
typename T>
__attribute__((always_inline)) inline static
void concat_forward (
const T* const * __restrict inputs,
const unsigned int* __restrict sizes,
T* __restrict output)
{
unsigned int total_concat_axis_size = 0;
for (unsigned int n = 0; n < NB_INPUTS; ++n)
total_concat_axis_size += sizes[n];
for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
// Loop over post-axis (e.g., dims after axis 1)
for (int j = 0; j < AXIS_SIZE_POST; ++j) {
unsigned int axis_offset = 0;
// Loop over each input tensor
for (unsigned int n = 0; n < NB_INPUTS; ++n) {
for (unsigned int k = 0; k < sizes[n]; ++k) {
const int input_idx = i * sizes[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
output[i * total_concat_axis_size * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
inputs[n][input_idx];
}
axis_offset += sizes[n]; // move along axis in output
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
......@@ -2,13 +2,13 @@
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
......@@ -17,10 +17,10 @@ template<int NB_CHANNELS,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -28,10 +28,10 @@ void convolution_forward(
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
......@@ -44,11 +44,13 @@ void convolution_forward(
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
......@@ -57,16 +59,16 @@ void convolution_forward(
const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = NB_OUTPUTS * oPos;
const int oOffset = NB_OUTPUTS * oPos;
// <--
Bias_T weightedSum = biases[output];
// Check if the biases are defined
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
......@@ -77,7 +79,7 @@ void convolution_forward(
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
int iOffset = NB_CHANNELS * iPos;
const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
......@@ -85,8 +87,8 @@ void convolution_forward(
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
inputs + iOffset,
weights + wOffset,
inputs + iOffset,
weights + wOffset,
weightedSum);
}
else {
......@@ -98,13 +100,13 @@ void convolution_forward(
continue;
}
int iOffsetInRange = iOffset
const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur
inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS,
inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS,
weightedSum);
}
}
......@@ -116,4 +118,45 @@ void convolution_forward(
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#include "network/typedefs.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
"NB_OUTPUTS should be a multiple of NB_CHANNELS.");
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
const int oOffset = NB_OUTPUTS * oPos;
// <--
const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
{
continue;
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy)
* KERNEL_WIDTH;
if (DILATION_X == 1 && ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
inputs + iOffset + channel,
weights + wOffset,
weightedSum);
}
else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
{
continue;
}
const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
weightedSum += inputs[iOffsetInRange + channel]
* weights[wOffset + sx];
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
}
}
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_depthwise_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
......@@ -2,7 +2,7 @@
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "network/activation_utils.hpp"
// Generic function for two inputs
......
......@@ -2,20 +2,20 @@
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void fullyconnected_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -28,14 +28,17 @@ void fullyconnected_forward (
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
}
}
......@@ -45,10 +48,12 @@ void fullyconnected_forward (
}
/*
Here the kernel to use with inputs in NHWC and weights in NHWC
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy);
......@@ -58,8 +63,8 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
* (iy + CHANNELS_HEIGHT * och);
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset,
weights + wOffset,
inputs + iOffset,
weights + wOffset,
weightedSum);
}
......@@ -69,4 +74,4 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
\ No newline at end of file
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#define __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
// Todo add border value and border type (Reflect, Constant, Wrap...) and add
// the two missing pad value (bottom and right)
template <unsigned int AXIS_DIM_SIZE,
unsigned int PREAXIS_STRIDE,
unsigned int AXIS_STRIDE,
unsigned int POSTAXIS_STRIDE,
unsigned int NB_ELTS,
typename Input_T,
typename Output_T>
// void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
// std::vector<DimSize_t>& dims, const void* input_, void* output_)
__attribute__((always_inline)) inline void
hardmax2d_forward(const Input_T *__restrict input,
Output_T *__restrict output) {
// fill output with 0
for (Output_T *i = output; i != output + NB_ELTS; ++i) {
*i = 0;
}
// For each index on all the axes before and after 'axis', we have a
// different max element to find
for (unsigned int i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
for (unsigned int j = 0; j < POSTAXIS_STRIDE; ++j) {
// Init the max with first element
unsigned int maxIdx = 0;
Input_T maxVal = input[preAxisOffset + j];
// Loop over the elements on 'axis'
// Since we start at 0th idx, we already initialize the values like
// the 1st iteration has been done
for (unsigned int k = 1,
postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
k < AXIS_DIM_SIZE;
++k, postAxisOffset += POSTAXIS_STRIDE) {
Input_T currVal = input[postAxisOffset + j];
// Update max elements
if (currVal > maxVal) {
maxIdx = k;
maxVal = currVal;
}
}
output[preAxisOffset + maxIdx * POSTAXIS_STRIDE + j] = 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
......@@ -11,7 +11,9 @@ void leakyrelu_forward (
Output_T* __restrict outputs,
const float negative_slope)
{
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < NB_DATA; ++i) {
if (inputs[i] >= 0) {
outputs[i] = inputs[i];
......