From 29a0fec1af61ea5fb77cf63e5ebf36b0d6273e82 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Thu, 12 Dec 2024 09:38:50 +0000
Subject: [PATCH 01/22] Update export_cpp with
 https://gitlab.eclipse.org/eclipse/aidge/aidge_core/-/merge_requests/277

---
 aidge_export_cpp/__init__.py       |  2 +-
 aidge_export_cpp/utils/__init__.py | 25 ++++++-------------------
 pyproject.toml                     | 17 +++++++++++------
 setup.cfg                          |  4 ++++
 4 files changed, 22 insertions(+), 26 deletions(-)
 create mode 100644 setup.cfg

diff --git a/aidge_export_cpp/__init__.py b/aidge_export_cpp/__init__.py
index 99df130..16c9be6 100644
--- a/aidge_export_cpp/__init__.py
+++ b/aidge_export_cpp/__init__.py
@@ -10,7 +10,7 @@ import aidge_core
 
 from aidge_export_cpp.utils import ROOT
 
-from ._version import *
+# from ._version import *
 
 from .export import *
 
diff --git a/aidge_export_cpp/utils/__init__.py b/aidge_export_cpp/utils/__init__.py
index 0728388..5b15131 100644
--- a/aidge_export_cpp/utils/__init__.py
+++ b/aidge_export_cpp/utils/__init__.py
@@ -1,27 +1,14 @@
 from pathlib import Path
-import os
+from importlib.metadata import version
 
 # Constants
 FILE = Path(__file__).resolve()
 ROOT = FILE.parents[1]
 
 
-OPERATORS_REGISTRY = {}
+def show_version():
+    version_aidge_export_cpp = version("aidge_export_cpp")
+    print(f"Aidge Export CPP: {version_aidge_export_cpp}")
 
-def operator_register(*args):
-   
-    key_list = [arg for arg in args]
-
-    def decorator(operator):
-        class Wrapper(operator):
-            def __init__(self, *args, **kwargs):
-                return operator(*args, **kwargs)
-        
-        for key in key_list:
-            OPERATORS_REGISTRY[key] = operator
-
-        return Wrapper
-    return decorator
-
-def supported_operators():
-    return list(OPERATORS_REGISTRY.keys())
+def get_project_version()->str:
+    return version("aidge_export_cpp")
diff --git a/pyproject.toml b/pyproject.toml
index 870f193..b3d85aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ dependencies = [
 requires-python = ">= 3.7"
 readme = "README.md"
 license = { file = "LICENSE" }
-classifiers = [ 
+classifiers = [
     "Development Status :: 2 - Pre-Alpha",
     "Intended Audience :: Developers",
     "Intended Audience :: Education",
@@ -26,7 +26,14 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development"
 ]
-dynamic = ["version"] # defined in tool.setuptools_scm
+dynamic = ["version"] # defined by pbr
+
+[project.urls]
+Homepage = "https://www.deepgreen.ai/en/platform"
+Documentation = "https://eclipse-aidge.readthedocs.io/en/latest/"
+Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_export_cpp"
+Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_export_cpp/-/issues/"
+Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_export_cpp/-/releases"
 
 [project.optional-dependencies]
 test = ["pytest"]
@@ -34,7 +41,7 @@ test = ["pytest"]
 [build-system]
 requires = [
     "setuptools>=64",
-    "setuptools_scm[toml]==7.1.0"
+    "pbr"
 ]
 build-backend = "setuptools.build_meta"
 
@@ -47,6 +54,4 @@ include = ["aidge_export_cpp"]  # package names should match these glob patterns
 namespaces = false # to disable scanning PEP 420 namespaces (true by default)
 [tool.setuptools.exclude-package-data]
 aidge_export_cpp = ["unit_tests*"] # exclude unit_tests which may be included as data
-# SETUPTOOLS_SCM
-[tool.setuptools_scm]
-write_to = "aidge_export_cpp/_version.py"
+
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..bb5e124
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+
+# pbr file
+[metadata]
+version = file: version.txt
-- 
GitLab
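
Note: this patch switches versioning from setuptools_scm to pbr, which reads the
version from version.txt (see the new setup.cfg), and adds show_version() /
get_project_version() helpers built on importlib.metadata. A minimal sketch of
the same lookup, assuming an installed aidge_export_cpp distribution:

    from importlib.metadata import PackageNotFoundError, version

    try:
        # Same query performed by the new show_version()/get_project_version() helpers.
        print(f"Aidge Export CPP: {version('aidge_export_cpp')}")
    except PackageNotFoundError:
        # Only available once the package is installed; at build time pbr
        # derives the version from version.txt.
        print("aidge_export_cpp is not installed")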


From d7d6330bdfadb2e09fe541ddff1f8c9315e679c3 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Thu, 2 Jan 2025 14:07:54 +0000
Subject: [PATCH 02/22] Fix import path + remove deprecated functions.

---
 aidge_export_cpp/__init__.py                  |   8 +-
 aidge_export_cpp/export.py                    | 128 +-----------------
 aidge_export_cpp/export_registry.py           |   2 +-
 aidge_export_cpp/operators.py                 |  17 ++-
 .../{utils/__init__.py => utils.py}           |   2 +-
 aidge_export_cpp/utils/converter.py           |  18 ---
 aidge_export_cpp/utils/generation.py          |  51 -------
 7 files changed, 25 insertions(+), 201 deletions(-)
 rename aidge_export_cpp/{utils/__init__.py => utils.py} (93%)
 delete mode 100644 aidge_export_cpp/utils/converter.py
 delete mode 100644 aidge_export_cpp/utils/generation.py

diff --git a/aidge_export_cpp/__init__.py b/aidge_export_cpp/__init__.py
index 16c9be6..4eff598 100644
--- a/aidge_export_cpp/__init__.py
+++ b/aidge_export_cpp/__init__.py
@@ -2,15 +2,9 @@ r"""
 Aidge Export for CPP standalone projects
 
 """
+from .utils import ROOT
 from .export_registry import ExportLibCpp
-
 from .operators import *
 from collections import defaultdict
-import aidge_core
-
-from aidge_export_cpp.utils import ROOT
-
-# from ._version import *
-
 from .export import *
 
diff --git a/aidge_export_cpp/export.py b/aidge_export_cpp/export.py
index ebac7a8..42bf90f 100644
--- a/aidge_export_cpp/export.py
+++ b/aidge_export_cpp/export.py
@@ -1,131 +1,15 @@
-import re
-import os
-import numpy as np
-
 import aidge_core
-
-from aidge_core.export_utils.code_generation import *
-from aidge_core.mem_info import compute_default_mem_info
-
-from aidge_export_cpp.utils import ROOT
-from aidge_export_cpp.utils.converter import numpy_dtype2ctype
 from aidge_export_cpp import ExportLibCpp
-from aidge_export_cpp.utils.generation import *
-# from aidge_export_cpp.memory import *
-
-
-def generate_input_file(export_folder:str,
-                        array_name:str,
-                        array: np.ndarray):
-
-    # If directory doesn't exist, create it
-    if not os.path.exists(export_folder):
-        os.makedirs(export_folder)
-
-    generate_file(
-        file_path=f"{export_folder}/{array_name}.h",
-        template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
-        dims = array.shape,
-        data_t = numpy_dtype2ctype(array.dtype),
-        name = array_name,
-        values = array.tolist()
-    )
-
 
 def export(export_folder_name, graphview, scheduler, mem_wrapping=False):
+    print("Warning: This function is deprecated, check tutorial https://eclipse.dev/aidge/source/Tutorial/export_cpp.html to find the new way to generate a C++ export.")
     aidge_core.export_utils.scheduler_export(
         scheduler,
         export_folder_name,
         ExportLibCpp,
-        memory_manager=compute_default_mem_info
+        memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
+        memory_manager_args={
+            "stats_folder": f"{export_folder_name}/stats",
+            "wrapping": mem_wrapping
+        }
     )
-
-    # export_folder = Path().absolute() / export_folder_name
-
-    # os.makedirs(str(export_folder), exist_ok=True)
-
-    # dnn_folder = export_folder / "dnn"
-    # os.makedirs(str(dnn_folder), exist_ok=True)
-
-    # list_actions = []
-    # list_configs = []
-    # peak_mem, mem_info = compute_default_mem_info(scheduler)
-    # list_forward_nodes = scheduler.get_static_scheduling()
-
-    # for node in list_forward_nodes:
-    #     if ExportLibCpp.exportable(node):
-    #         op = ExportLibCpp.get_export_node(node)(node, mem_info[node])
-    #         # For configuration files
-    #         list_configs = op.export(dnn_folder, list_configs)
-
-    #         # For forward file
-    #         list_actions = op.forward(list_actions)
-    #     else:
-    #         raise RuntimeError(f"Operator not supported: {node.type()} !")
-
-    # # Memory management
-    # # stats_folder = export_folder / "statistics"
-    # # os.makedirs(str(stats_folder), exist_ok=True)
-    # # mem_size, mem_info = generate_optimized_memory_info(stats_folder, scheduler, mem_wrapping)
-    # # peak_mem, mem_info = compute_default_mem_info(scheduler)
-
-    # # Generate the memory file
-    # # generate_file(
-    # #     str(dnn_folder / "memory" / "mem_info.h"),
-    # #     str(ROOT / "templates" / "memory" / "mem_info.jinja"),
-    # #     mem_size = mem_size,
-    # #     mem_info_legends = MEMORY_INFO_TEMPLATE,
-    # #     mem_info = mem_info
-    # # )
-    # # list_configs.append("memory/mem_info.h")
-
-    # # Get entry nodes
-    # # Store the datatype & name
-    # list_inputs_name = []
-    # for node in graphview.get_input_nodes():
-    #     for idx, node_input_tuple in enumerate(node.inputs()):
-    #         node_input, _ = node_input_tuple
-    #         if node_input is None:
-    #             export_type = aidge2c(node.get_operator().get_output(0).dtype())
-    #             list_inputs_name.append((export_type, f"{node.name()}_input_{idx}"))
-    #         elif node_input not in graphview.get_nodes():
-    #             export_type = aidge2c(node_input.get_operator().get_output(0).dtype())
-    #             list_inputs_name.append((export_type, node_input.name()))
-
-
-    # # Get output nodes
-    # # Store the datatype & name, like entry nodes
-    # list_outputs_name = []
-    # for node in graphview.get_nodes():
-    #     if len(node.get_children()) == 0:
-    #         export_type = aidge2c(node.get_operator().get_output(0).dtype())
-    #         list_outputs_name.append((export_type, f"{node.name()}_output_0"))
-
-    # # Generate forward file
-    # # TODO: for now the mem type is bound for all intermediate results, should change.
-    # # Note that we may have all inputs constants, hence select output type
-    # assert len(list_outputs_name) >= 1, f"TODO: requires some output to determine mem type"
-    # mem_ctype = list_outputs_name[0][0]
-    # generate_file(
-    #     str(dnn_folder / "src" / "forward.cpp"),
-    #     str(ROOT / "templates" / "network" / "network_forward.jinja"),
-    #     headers=set(list_configs),
-    #     actions=list_actions,
-    #     inputs= list_inputs_name,
-    #     outputs=list_outputs_name,
-    #     mem_ctype=mem_ctype,
-    #     peak_mem=peak_mem
-    # )
-
-    # # Generate dnn API
-    # generate_file(
-    #     str(dnn_folder / "include" / "dnn.hpp"),
-    #     str(ROOT / "templates" / "network" / "dnn_header.jinja"),
-    #     libraries=[],
-    #     functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
-    # )
-
-    # # Copy all static files in the export
-    # shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
-    # shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
-    # shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
diff --git a/aidge_export_cpp/export_registry.py b/aidge_export_cpp/export_registry.py
index f1aa83b..876e4ff 100644
--- a/aidge_export_cpp/export_registry.py
+++ b/aidge_export_cpp/export_registry.py
@@ -1,5 +1,5 @@
 from aidge_core.export_utils import ExportLib
-from aidge_export_cpp.utils import ROOT
+from aidge_export_cpp import ROOT
 
 class ExportLibCpp(ExportLib):
     _name="export_cpp"
diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index 9654a20..f04dbb3 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -4,12 +4,27 @@ from pathlib import Path
 import aidge_core
 from aidge_core.export_utils import ExportNode, ExportNodeCpp, generate_file
 from aidge_export_cpp.utils import ROOT
-from aidge_export_cpp.utils.converter import numpy_dtype2ctype
 from aidge_export_cpp import ExportLibCpp
 
 ##############################################
 ############## Export functions ##############
 ##############################################
+def numpy_dtype2ctype(dtype):
+    if dtype == np.int8:
+        return "int8_t"
+    elif dtype == np.int16:
+        return "int16_t"
+    elif dtype == np.int32:
+        return "int32_t"
+    elif dtype == np.int64:
+        return "int64_t"
+    elif dtype == np.float32:
+        return "float"
+    elif dtype == np.float64:
+        return "double"
+    # Add more dtype mappings as needed
+    else:
+        raise ValueError(f"Unsupported {dtype} dtype")
 
 def export_params(name: str,
                   array: np.ndarray,
diff --git a/aidge_export_cpp/utils/__init__.py b/aidge_export_cpp/utils.py
similarity index 93%
rename from aidge_export_cpp/utils/__init__.py
rename to aidge_export_cpp/utils.py
index 5b15131..915c2c6 100644
--- a/aidge_export_cpp/utils/__init__.py
+++ b/aidge_export_cpp/utils.py
@@ -3,7 +3,7 @@ from importlib.metadata import version
 
 # Constants
 FILE = Path(__file__).resolve()
-ROOT = FILE.parents[1]
+ROOT = FILE.parents[0]
 
 
 def show_version():
diff --git a/aidge_export_cpp/utils/converter.py b/aidge_export_cpp/utils/converter.py
deleted file mode 100644
index d4af124..0000000
--- a/aidge_export_cpp/utils/converter.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import numpy as np
-
-def numpy_dtype2ctype(dtype):
-    if dtype == np.int8:
-        return "int8_t"
-    elif dtype == np.int16:
-        return "int16_t"
-    elif dtype == np.int32:
-        return "int32_t"
-    elif dtype == np.int64:
-        return "int64_t"
-    elif dtype == np.float32:
-        return "float"
-    elif dtype == np.float64:
-        return "double"
-    # Add more dtype mappings as needed
-    else:
-        raise ValueError(f"Unsupported {dtype} dtype")
diff --git a/aidge_export_cpp/utils/generation.py b/aidge_export_cpp/utils/generation.py
deleted file mode 100644
index 4478ef7..0000000
--- a/aidge_export_cpp/utils/generation.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import re
-import os
-import shutil
-from jinja2 import Environment, FileSystemLoader
-
-
-def get_functions_from_c_file(file_path):
-    functions = []
-    pattern = r'\w+\s+(\w+)\s*\(([^)]*)\)\s*{'
-
-    with open(file_path, 'r') as file:
-        file_content = file.read()
-
-    matches = re.findall(pattern, file_content)
-    for match in matches:
-        function_name = match[0]
-        arguments = match[1].split(',')
-        arguments = [arg.strip() for arg in arguments]
-
-        return_type = get_return_type(file_content, function_name)
-
-        function_string = f"{return_type} {function_name}({', '.join(arguments)});"
-        functions.append(function_string)
-
-    return functions
-
-
-def get_return_type(file_content, function_name):
-    pattern = rf'\w+\s+{function_name}\s*\([^)]*\)\s*{{'
-    return_type = re.search(pattern, file_content).group()
-    return_type = return_type.split()[0].strip()
-    return return_type
-
-
-def get_functions_from_c_folder(folder_path):
-    functions = []
-    
-    for _, _, files in os.walk(folder_path):
-        for file in files:
-            functions += get_functions_from_c_file(os.path.join(folder_path, file))
-
-    return functions
-
-
-def copyfile(filename, dst_folder):
-
-    # If directory doesn't exist, create it
-    if not os.path.exists(dst_folder):
-        os.makedirs(dst_folder)
-
-    shutil.copy(filename, dst_folder)
-- 
GitLab
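
Note: the dtype-to-C-type mapping previously in utils/converter.py now lives in
operators.py, and export() becomes a thin deprecated wrapper around
aidge_core.export_utils.scheduler_export with the optimized memory-info manager.
A small sketch of the mapping behaviour (same table as the function added to
operators.py):

    import numpy as np

    # Same mapping as numpy_dtype2ctype() in operators.py.
    _NP2C = {np.int8: "int8_t", np.int16: "int16_t", np.int32: "int32_t",
             np.int64: "int64_t", np.float32: "float", np.float64: "double"}

    def numpy_dtype2ctype(dtype):
        for np_dtype, ctype in _NP2C.items():
            if dtype == np_dtype:
                return ctype
        raise ValueError(f"Unsupported {dtype} dtype")

    assert numpy_dtype2ctype(np.float32) == "float"
    assert numpy_dtype2ctype(np.dtype("int32")) == "int32_t"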


From 847a27b15d16bf6b62964c070573f6e4761194ea Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 19 Jan 2025 15:48:57 +0100
Subject: [PATCH 03/22] Hotfix: source files were not included anymore

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index b3d85aa..25fb9d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,8 @@ build-backend = "setuptools.build_meta"
 where = ["."]  # list of folders that contain the packages (["."] by default)
 include = ["aidge_export_cpp"]  # package names should match these glob patterns (["*"] by default)
 namespaces = false # to disable scanning PEP 420 namespaces (true by default)
+[tool.setuptools.package-data]
+'aidge_export_cpp' = ['**/*']
 [tool.setuptools.exclude-package-data]
 aidge_export_cpp = ["unit_tests*"] # exclude unit_tests which may be included as data
 
-- 
GitLab


From 4b92baa187381698f8cee9619b19cebf31a40a63 Mon Sep 17 00:00:00 2001
From: Matthew  Newson <matthew.newson@cea.fr>
Date: Wed, 12 Feb 2025 14:17:37 +0000
Subject: [PATCH 04/22] Add erf kernel (erf.hpp)

---
 aidge_export_cpp/kernels/erf.hpp | 40 ++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 aidge_export_cpp/kernels/erf.hpp

diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
new file mode 100644
index 0000000..b509133
--- /dev/null
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -0,0 +1,40 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
+#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
+
+#include "network/typedefs.hpp"
+#include <cmath>
+#include <math.h>
+#include <iostream>
+
+template<int _NB_ELTS,
+         typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void erf_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    double a1 =  0.254829592;
+    double a2 = -0.284496736;
+    double a3 =  1.421413741;
+    double a4 = -1.453152027;
+    double a5 =  1.061405429;
+    double p  =  0.3275911;
+
+
+#pragma omp parallel for
+    for (int i = 0; i < _NB_ELTS; ++i) {
+        int sign = 1;
+        if (inputs[i] < 0)
+            sign = -1;
+        double abs_value = abs(inputs[i]);
+        
+        // A&S formula 7.1.26
+        double t = 1.0/(1.0 + p*abs_value);
+        double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
+        outputs[i] = sign*y;
+
+    }
+}
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ERF__
\ No newline at end of file
-- 
GitLab
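
Note: erf_forward approximates the error function with Abramowitz & Stegun
formula 7.1.26 (maximum absolute error around 1.5e-7); one possible caveat is the
call to abs() on a double, where std::fabs would avoid any risk of selecting an
integer overload. A small Python check of the same approximation against
math.erf:

    import math

    # Abramowitz & Stegun 7.1.26, same constants as erf.hpp.
    A1, A2, A3, A4, A5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    P = 0.3275911

    def erf_approx(x):
        sign = -1.0 if x < 0 else 1.0
        x = abs(x)
        t = 1.0 / (1.0 + P * x)
        y = 1.0 - (((((A5 * t + A4) * t) + A3) * t + A2) * t + A1) * t * math.exp(-x * x)
        return sign * y

    # Stays well below 1e-6 on a few sample points.
    assert all(abs(erf_approx(v) - math.erf(v)) < 1e-6 for v in (-2.0, -0.5, 0.0, 0.5, 2.0))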


From 9b6fb714ff531a47da042fce67ef28cd864418ae Mon Sep 17 00:00:00 2001
From: Matthew  Newson <matthew.newson@cea.fr>
Date: Wed, 12 Feb 2025 14:21:31 +0000
Subject: [PATCH 05/22] Add transpose kernel (transpose.hpp)

---
 aidge_export_cpp/kernels/transpose.hpp | 56 ++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 aidge_export_cpp/kernels/transpose.hpp

diff --git a/aidge_export_cpp/kernels/transpose.hpp b/aidge_export_cpp/kernels/transpose.hpp
new file mode 100644
index 0000000..a6ddce2
--- /dev/null
+++ b/aidge_export_cpp/kernels/transpose.hpp
@@ -0,0 +1,56 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+
+#include "network/typedefs.hpp"
+#include <cstring>
+#include <cstdio>
+#include <iostream>
+
+using namespace std; 
+
+template< int INPUT_DIMS[],  int PERM[], int OUTPUT_DIMS[], 
+		int SIZE_OUTPUT_DIMS, int SIZE,
+        typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void transpose_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+    {
+
+	int newStrides[SIZE_OUTPUT_DIMS];
+	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){newStrides[i] = 1;}
+	for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) {
+		for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) {
+			newStrides[i] *= OUTPUT_DIMS[j];
+		}
+	}
+
+	int indices[SIZE_OUTPUT_DIMS];
+	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
+
+	for (int i = 0; i < SIZE; ++i) {
+		int idx = 0;
+		for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
+			idx += indices[PERM[j]] * newStrides[j];
+		}
+
+		outputs[idx] = inputs[i];
+
+
+		for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
+			if (indices[j] < INPUT_DIMS[j] - 1) {
+				indices[j]++;
+				break;
+			}
+			else {
+				indices[j] = 0;
+			}
+		}
+	}
+
+    
+}
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
\ No newline at end of file
-- 
GitLab
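
Note: transpose_forward walks the input linearly and scatters each element to the
output offset built from the output strides and the permuted multi-index. A NumPy
sketch of the same index arithmetic, checked against np.transpose (the shapes
below are hypothetical):

    import numpy as np

    def transpose_ref(inputs, input_dims, perm):
        output_dims = [input_dims[p] for p in perm]
        n = len(output_dims)
        # Row-major strides of the output tensor, as computed in transpose.hpp.
        strides = [1] * n
        for i in range(n):
            for j in range(i + 1, n):
                strides[i] *= output_dims[j]
        outputs = [0] * len(inputs)
        indices = [0] * n  # multi-index over the input dims
        for value in inputs:
            idx = sum(indices[perm[j]] * strides[j] for j in range(n))
            outputs[idx] = value
            for j in range(n - 1, -1, -1):  # increment the input multi-index
                if indices[j] < input_dims[j] - 1:
                    indices[j] += 1
                    break
                indices[j] = 0
        return outputs

    dims, perm = (2, 3, 4), (2, 0, 1)
    x = np.arange(np.prod(dims), dtype=np.float32)
    ref = np.transpose(x.reshape(dims), perm).ravel()
    assert np.array_equal(np.asarray(transpose_ref(x.tolist(), dims, perm), dtype=np.float32), ref)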


From 1fd2f8da29e2b797bbc9e036cfff5e869b6fa52c Mon Sep 17 00:00:00 2001
From: Matthew  Newson <matthew.newson@cea.fr>
Date: Tue, 25 Feb 2025 12:56:48 +0000
Subject: [PATCH 06/22] Add erf forward Jinja template

---
 aidge_export_cpp/templates/kernel_forward/erf_forward.jinja | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 aidge_export_cpp/templates/kernel_forward/erf_forward.jinja

diff --git a/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
new file mode 100644
index 0000000..9f3fbf3
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja
@@ -0,0 +1,6 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+erf_forward<{{name|upper}}_NB_ELTS>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
\ No newline at end of file
-- 
GitLab


From 0264ce4b97ff330aea589f650a10b30c07bccf18 Mon Sep 17 00:00:00 2001
From: Matthew  Newson <matthew.newson@cea.fr>
Date: Tue, 25 Feb 2025 12:57:58 +0000
Subject: [PATCH 07/22] Add transpose forward Jinja template

---
 .../templates/kernel_forward/transpose_forward.jinja  | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja

diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
new file mode 100644
index 0000000..2f8d939
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
@@ -0,0 +1,11 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+transpose_forward<{{ name|upper }}_INPUT_DIMS,
+                    {{ name|upper }}_PERM, 
+                    {{ name|upper}}_OUTPUT_DIMS, 
+                    {{ name|upper}}_SIZE_OUTPUT_DIMS, 
+                    {{name|upper}}_SIZE>
+                   ({{in_name[0]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
+
-- 
GitLab


From bb72026146e36ecdf51188f2f9d047dcfd6c4329 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Wed, 26 Feb 2025 14:27:14 +0000
Subject: [PATCH 08/22] Adding erf and transpose config

---
 .../templates/configuration/erf_config.jinja    | 11 +++++++++++
 .../configuration/transpose_config.jinja        | 17 +++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 aidge_export_cpp/templates/configuration/erf_config.jinja
 create mode 100644 aidge_export_cpp/templates/configuration/transpose_config.jinja

diff --git a/aidge_export_cpp/templates/configuration/erf_config.jinja b/aidge_export_cpp/templates/configuration/erf_config.jinja
new file mode 100644
index 0000000..b273472
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/erf_config.jinja
@@ -0,0 +1,11 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/transpose_config.jinja b/aidge_export_cpp/templates/configuration/transpose_config.jinja
new file mode 100644
index 0000000..c3eabc5
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/transpose_config.jinja
@@ -0,0 +1,17 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+
+{# For layer configuration -#}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_OUTPUT_DIMS {{out_dims[0]|length}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_PERM[] = { {{ output_dims_order|join(", ") }} };
+
+
+
+#endif /* {{ name|upper }}_LAYER_H */
-- 
GitLab
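
Note: the new config templates emit the element count as a compile-time product of
the input dimensions, and the transpose config additionally materialises the dims
and permutation as int arrays. A small Jinja2 sketch of the join filter used for
NB_ELTS (simplified; the real templates also include _def_io.jinja and
_meminfo.jinja):

    from jinja2 import Template

    # Simplified stand-in for the NB_ELTS line of erf_config.jinja.
    tpl = Template("#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}")
    print(tpl.render(name="erf0", in_dims=[[1, 8, 16, 16]]))
    # -> #define ERF0_NB_ELTS 1*8*16*16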


From 659a8f4bb519f6b7f1863bb9de994608b4b5ff57 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Wed, 26 Feb 2025 15:19:25 +0000
Subject: [PATCH 09/22] Add new kernel files for add, div, mul and sub to make
 them easier to read and manipulate

---
 aidge_export_cpp/kernels/add.hpp              | 104 ++++++++++++++++++
 aidge_export_cpp/kernels/div.hpp              | 103 +++++++++++++++++
 aidge_export_cpp/kernels/mul.hpp              | 104 ++++++++++++++++++
 aidge_export_cpp/kernels/sub.hpp              | 103 +++++++++++++++++
 .../templates/configuration/add_config.jinja  |  25 +++++
 .../templates/configuration/div_config.jinja  |  25 +++++
 .../templates/configuration/mul_config.jinja  |  25 +++++
 .../templates/configuration/sub_config.jinja  |  25 +++++
 .../kernel_forward/add_forward.jinja          |  14 +++
 .../kernel_forward/div_forward.jinja          |  14 +++
 .../kernel_forward/mul_forward.jinja          |  14 +++
 .../kernel_forward/sub_forward.jinja          |  14 +++
 12 files changed, 570 insertions(+)
 create mode 100644 aidge_export_cpp/kernels/add.hpp
 create mode 100644 aidge_export_cpp/kernels/div.hpp
 create mode 100644 aidge_export_cpp/kernels/mul.hpp
 create mode 100644 aidge_export_cpp/kernels/sub.hpp
 create mode 100644 aidge_export_cpp/templates/configuration/add_config.jinja
 create mode 100644 aidge_export_cpp/templates/configuration/div_config.jinja
 create mode 100644 aidge_export_cpp/templates/configuration/mul_config.jinja
 create mode 100644 aidge_export_cpp/templates/configuration/sub_config.jinja
 create mode 100644 aidge_export_cpp/templates/kernel_forward/add_forward.jinja
 create mode 100644 aidge_export_cpp/templates/kernel_forward/div_forward.jinja
 create mode 100644 aidge_export_cpp/templates/kernel_forward/mul_forward.jinja
 create mode 100644 aidge_export_cpp/templates/kernel_forward/sub_forward.jinja

diff --git a/aidge_export_cpp/kernels/add.hpp b/aidge_export_cpp/kernels/add.hpp
new file mode 100644
index 0000000..03ba2c5
--- /dev/null
+++ b/aidge_export_cpp/kernels/add.hpp
@@ -0,0 +1,104 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ADD__
+#define __AIDGE_EXPORT_CPP_KERNELS_ADD__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+#include <iostream>
+#include <cassert>
+
+
+
+template<int NB_ELTS, 
+        int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
+		int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, 
+        ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>        
+__attribute__((always_inline)) inline
+void add_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
+        ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
+        ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
+    }
+    
+    // Find the highest equal dimension
+    int contiguousidx  = SIZE_DIM_OUT -1 ;
+
+    for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx  = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[contiguousidx ] ;
+    int stride_post1[contiguousidx ] ;
+    int stride_step0[contiguousidx ] ;
+    int stride_step1[contiguousidx ] ;
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx  - 1] = 1;
+        stride_post1[contiguousidx  - 1] = 1;
+        #pragma omp parallel for
+        for (int i = contiguousidx -2; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        #pragma omp parallel for
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for(int i = 0; i<contiguousidx ; ++i){
+        nbMatrices *= OUTPUT_DIMS[i];
+        
+    }
+    int dim = contiguousidx  - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ADD__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/div.hpp b/aidge_export_cpp/kernels/div.hpp
new file mode 100644
index 0000000..f1ff7d0
--- /dev/null
+++ b/aidge_export_cpp/kernels/div.hpp
@@ -0,0 +1,103 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_DIV__
+#define __AIDGE_EXPORT_CPP_KERNELS_DIV__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+#include <iostream>
+#include <cassert>
+
+
+
+template<int NB_ELTS, 
+        int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
+		int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, 
+        ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>        
+__attribute__((always_inline)) inline
+void div_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
+        ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
+        ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
+    }
+    
+    // Find the highest equal dimension
+    int contiguousidx  = SIZE_DIM_OUT -1 ;
+
+    for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx  = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[contiguousidx ] ;
+    int stride_post1[contiguousidx ] ;
+    int stride_step0[contiguousidx ] ;
+    int stride_step1[contiguousidx ] ;
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx  - 1] = 1;
+        stride_post1[contiguousidx  - 1] = 1;
+        for (int i = contiguousidx -2; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for(int i = 0; i<contiguousidx ; ++i){
+        nbMatrices *= OUTPUT_DIMS[i];
+        
+    }
+    int dim = contiguousidx  - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_DIV__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/mul.hpp b/aidge_export_cpp/kernels/mul.hpp
new file mode 100644
index 0000000..cbed0f6
--- /dev/null
+++ b/aidge_export_cpp/kernels/mul.hpp
@@ -0,0 +1,104 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_MUL__
+#define __AIDGE_EXPORT_CPP_KERNELS_MUL__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+#include <iostream>
+#include <cassert>
+
+
+
+template<int NB_ELTS, 
+        int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
+		int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, 
+        ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>        
+__attribute__((always_inline)) inline
+void mul_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];                                                  
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
+        ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
+        ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
+    }
+    
+    // Find the highest equal dimension
+    int contiguousidx  = SIZE_DIM_OUT -1 ;
+
+    for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx  = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[contiguousidx ] ;
+    int stride_post1[contiguousidx ] ;
+    int stride_step0[contiguousidx ] ;
+    int stride_step1[contiguousidx ] ;
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx  - 1] = 1;
+        stride_post1[contiguousidx  - 1] = 1;
+        #pragma omp parallel for
+        for (int i = contiguousidx -2; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        #pragma omp parallel for
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for(int i = 0; i<contiguousidx ; ++i){
+        nbMatrices *= OUTPUT_DIMS[i];
+        
+    }
+    int dim = contiguousidx  - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MUL__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/sub.hpp b/aidge_export_cpp/kernels/sub.hpp
new file mode 100644
index 0000000..07637cd
--- /dev/null
+++ b/aidge_export_cpp/kernels/sub.hpp
@@ -0,0 +1,103 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_SUB__
+#define __AIDGE_EXPORT_CPP_KERNELS_SUB__
+
+#include "network/typedefs.hpp"
+#include "kernels/activation.hpp"
+#include <iostream>
+#include <cassert>
+
+
+
+template<int NB_ELTS, 
+        int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
+		int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE, 
+        ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>        
+__attribute__((always_inline)) inline
+void sub_forward (
+    Output_T* __restrict outputs,
+    const Input_T* __restrict inputs1,
+    const Input_T* __restrict inputs2)
+{
+
+    int ndim_a[SIZE_DIM_OUT];
+    int ndim_b[SIZE_DIM_OUT];
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
+        ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
+        ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
+    }
+    
+    // Find the highest equal dimension
+    int contiguousidx  = SIZE_DIM_OUT -1 ;
+
+    for (int i = contiguousidx ; ndim_a[i] == ndim_b[i]; i--) {
+        contiguousidx  = i;
+    }
+
+    // Compute the highest number of contiguous data for each Tensor
+    int input0_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input0_contiguous_size *= ndim_a[i];
+    }
+    int input1_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        input1_contiguous_size *= ndim_b[i];
+    }
+    int output_contiguous_size = 1;
+    for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
+        output_contiguous_size *= OUTPUT_DIMS[i];
+    }
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[contiguousidx ] ;
+    int stride_post1[contiguousidx ] ;
+    int stride_step0[contiguousidx ] ;
+    int stride_step1[contiguousidx ] ;
+    if (contiguousidx > 0) {
+        stride_post0[contiguousidx  - 1] = 1;
+        stride_post1[contiguousidx  - 1] = 1;
+        for (int i = contiguousidx -2; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < contiguousidx ; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    int nbMatrices = 1;
+    for(int i = 0; i<contiguousidx ; ++i){
+        nbMatrices *= OUTPUT_DIMS[i];
+        
+    }
+    int dim = contiguousidx  - 1;
+    for(int stack = 0; stack < nbMatrices;){
+        for(int i = 0; i < output_contiguous_size; ++i){
+            int in0_id = (input0_contiguous_size != 1) ? i : 0;
+            int in1_id = (input1_contiguous_size != 1) ? i : 0;
+            outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size];
+        }
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousidx  - 1;
+        }
+    }
+}
+
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_SUB__
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/add_config.jinja b/aidge_export_cpp/templates/configuration/add_config.jinja
new file mode 100644
index 0000000..143d004
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/add_config.jinja
@@ -0,0 +1,25 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/div_config.jinja b/aidge_export_cpp/templates/configuration/div_config.jinja
new file mode 100644
index 0000000..143d004
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/div_config.jinja
@@ -0,0 +1,25 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/mul_config.jinja b/aidge_export_cpp/templates/configuration/mul_config.jinja
new file mode 100644
index 0000000..143d004
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/mul_config.jinja
@@ -0,0 +1,25 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/configuration/sub_config.jinja b/aidge_export_cpp/templates/configuration/sub_config.jinja
new file mode 100644
index 0000000..143d004
--- /dev/null
+++ b/aidge_export_cpp/templates/configuration/sub_config.jinja
@@ -0,0 +1,25 @@
+{#- For name header -#}
+#ifndef {{ name|upper }}_LAYER_H
+#define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
+
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+{# For layer configuration -#}
+#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
+#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
+#define {{name|upper }}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+#define {{ name|upper }}_ACTIVATION {{ activation }}
+static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
+#endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/kernel_forward/add_forward.jinja b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja
new file mode 100644
index 0000000..3176ced
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/add_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+add_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/div_forward.jinja b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja
new file mode 100644
index 0000000..4b79357
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/div_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+div_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja
new file mode 100644
index 0000000..9a7170b
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/mul_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+mul_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
diff --git a/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja
new file mode 100644
index 0000000..51b47a8
--- /dev/null
+++ b/aidge_export_cpp/templates/kernel_forward/sub_forward.jinja
@@ -0,0 +1,14 @@
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+sub_forward<{{name|upper}}_NB_ELTS,
+               {{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
+               {{name|upper}}_OUT_SIZE,
+               {{name|upper}}_ACTIVATION>
+                 ({{out_name[0]}}, {{in_name[0]}}, {{in_name[1]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %}
-- 
GitLab
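
Note: add, div, mul and sub now implement NumPy-style broadcasting of the two
input shapes before applying the elementwise operation. Two apparent review
points: the ACTIVATION template parameter is accepted but never applied in the
loop body, and the #pragma omp parallel for over the stride_post computation in
add.hpp and mul.hpp has a loop-carried dependency (stride_post[i] reads
stride_post[i+1]). A NumPy reference for the intended semantics (hypothetical
shapes):

    import numpy as np

    # Reference semantics targeted by the broadcast kernels: NumPy broadcasting,
    # then the elementwise op. INPUT_A_DIMS = (2, 3, 4), INPUT_B_DIMS = (1, 1, 4).
    a = np.arange(2 * 3 * 4, dtype=np.float32).reshape(2, 3, 4)
    b = np.arange(1, 5, dtype=np.float32).reshape(1, 1, 4)

    out_add = a + b   # add_forward
    out_sub = a - b   # sub_forward
    out_mul = a * b   # mul_forward
    out_div = a / b   # div_forward (b is non-zero here)
    assert out_add.shape == (2, 3, 4)  # == OUTPUT_DIMS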


From a4dd75a361f3d98565287160a44426ebe1fa8515 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Tue, 4 Mar 2025 12:29:22 +0000
Subject: [PATCH 10/22] Fix batchnorm and enable broadcasting for matmul and
 transpose

---
 aidge_export_cpp/kernels/batchnorm.hpp |  32 +++++---
 aidge_export_cpp/kernels/matmul.hpp    | 108 +++++++++++++++++++++----
 aidge_export_cpp/kernels/transpose.hpp |   3 +-
 3 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp
index 740ea21..0ed5080 100644
--- a/aidge_export_cpp/kernels/batchnorm.hpp
+++ b/aidge_export_cpp/kernels/batchnorm.hpp
@@ -3,7 +3,9 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/rescaling.hpp"
+#include "kernels/activation.hpp"
 #include <math.h>
+#include <iostream>
 
 // WARNING: this kernel only works for 32-bits floating point values
 
@@ -12,30 +14,34 @@ template<int NB_OUTPUTS,
          ActivationFunction_T ACTIVATION,
          typename Input_T, typename Output_T,
          typename Param_T>
-__attribute__((always_inline)) inline
+__attribute__((always_inline)) inline   
 void batchnorm_forward (
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs,
+    const Param_T* __restrict scales,
     const Param_T* __restrict biases,
-    const Param_T* __restrict variances,
     const Param_T* __restrict means,
-    const Param_T* __restrict scales,
+    const Param_T* __restrict variances,
     const double epsilon)
 {
-    for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
-        const Output_T var = sqrt(variances[output] + epsilon);
 
-        for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-            for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-                const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
-
-                const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
-                const Output_T sAs = scales[output] * normalized + biases[output];
-                outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
-            }
+    int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH;
+    #pragma omp parallel for
+    for (int ch = 0; ch < NB_OUTPUTS; ++ch) {
+        int ioIndex = ch * featureMapSize;
+        #pragma omp parallel for
+        for (int i = ioIndex; i < ioIndex + featureMapSize; i++) {
+            outputs[i] = biases[ch];
+        }
+        float var = sqrt(variances[ch] + epsilon);
+        #pragma omp parallel for
+        for (int feature = 0; feature < featureMapSize; ++feature) {
+            outputs[ioIndex + feature] += (scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var);
         }
     }
+
 }
 
 
+
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp
index 4500993..b284214 100644
--- a/aidge_export_cpp/kernels/matmul.hpp
+++ b/aidge_export_cpp/kernels/matmul.hpp
@@ -3,31 +3,109 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-
+#include <iostream>
 // Generic function for matmul and activation
 
-template<int M,
-         int K,
-         int N,
+template<int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
+		int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT, 
          ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+         typename Input_T, typename Output_T>
 __attribute__((always_inline)) inline
 void matmul_forward (
     const Input_T* __restrict inputs1,
     const Input_T* __restrict inputs2,
-    Output_T* __restrict outputs,
-    const Rescaling_T& __restrict rescaling)
+    Output_T* __restrict outputs)
 {
-    for (int m = 0; m < M; ++m) {
-        for (int n = 0; n < N; ++n) {
-            Output_T sum = Output_T(0);
-            for (int k = 0; k < K; ++k) {
-                sum += inputs1[K*m + k] * inputs2[N*k + n];
+
+    //initialize arrays storing broadcasted(or not) dims
+    int ndim_a[SIZE_DIM_OUT];     
+    int ndim_b[SIZE_DIM_OUT];
+    if ( _SIZE_DIM_IN_A == 1){ 
+        ndim_a[0] = 1;
+        ndim_a[1] =INPUT_A_DIMS[0];
+    }
+    if ( _SIZE_DIM_IN_B == 1){ 
+        ndim_b[0] =INPUT_B_DIMS[0];
+        ndim_b[1] = 1;
+    }
+    
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_A;
+        ndim_a[i] = (i< idx) ? 1 :INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_B;
+        ndim_b[i] = (i< idx) ? 1 :INPUT_B_DIMS[i-idx];
+    }
+        
+    // initialize strides to iterate through data because of broadcasting
+    int stride_post0[SIZE_DIM_OUT-2] ;
+    int stride_post1[SIZE_DIM_OUT-2] ; 
+    int stride_step0[SIZE_DIM_OUT-2] ;
+    int stride_step1[SIZE_DIM_OUT-2] ; 
+    if (SIZE_DIM_OUT > 2){ 
+        stride_post0[SIZE_DIM_OUT - 3] = 1;
+        stride_post1[SIZE_DIM_OUT - 3] = 1;
+        for (int i = SIZE_DIM_OUT-4; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < SIZE_DIM_OUT-2; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+
+    }
+
+    
+    // if _SIZE_DIM_IN_B == _SIZE_DIM_IN_A, then _SIZE_DIM_IN_A == SIZE_DIM_OUT == _SIZE_DIM_IN_B; 
+    // else it will be broadcasted to the correct dims
+
+    int nbMatrices = 1;
+    for(int i = SIZE_DIM_OUT -3; i>=0; --i){
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = SIZE_DIM_OUT -3;
+
+
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    const int n = ndim_a[SIZE_DIM_OUT - 2];
+    const int k = ndim_a[SIZE_DIM_OUT - 1];
+    const int m = ndim_b[SIZE_DIM_OUT - 1];
+    const int matrix0Size = n*k;
+    const int matrix1Size = k*m;
+    const int matrixOutSize = n*m;
+
+    for(int stack = 0; stack < nbMatrices;){
+
+        for (int i = 0; i < n; ++i) {
+
+            for (int j = 0; j < m; ++j) {
+                float sum = 0;
+
+                for (int l = 0; l < k; ++l) {
+                    sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]);
+                }
+                outputs[ offsetOut*matrixOutSize + i*m + j] = sum;
+            }
+        } 
+
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
             }
-            outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = SIZE_DIM_OUT -3;
         }
+
     }
+
 }
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/transpose.hpp b/aidge_export_cpp/kernels/transpose.hpp
index a6ddce2..2a89e3c 100644
--- a/aidge_export_cpp/kernels/transpose.hpp
+++ b/aidge_export_cpp/kernels/transpose.hpp
@@ -27,9 +27,10 @@ void transpose_forward (
 
 	int indices[SIZE_OUTPUT_DIMS];
 	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
-
+	#pragma omp parallel for
 	for (int i = 0; i < SIZE; ++i) {
 		int idx = 0;
+		#pragma omp parallel for
 		for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
 			idx += indices[PERM[j]] * newStrides[j];
 		}
-- 
GitLab
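
The kernel added above computes a batched matrix product: the last two dims of each input form an N x K by K x M product, the leading dims are broadcast against the output's batch dims, and the stride_step arrays snap a broadcast (size-1) dim's offset back so the same matrix is reused across the stack. Below is a minimal, self-contained sketch (not the exported kernel) of the behaviour this is meant to reproduce, with illustrative shapes where A carries a batch dim and B does not:

    #include <cstdio>

    // A: [BATCH][N][K], B: [K][M] (broadcast over BATCH), OUT: [BATCH][N][M]
    constexpr int BATCH = 2, N = 3, K = 4, M = 5;

    void batched_matmul(const float* a, const float* b, float* out) {
        for (int s = 0; s < BATCH; ++s) {
            // B has no batch dim, so its offset never advances; this is what
            // stride_step == 1 - stride_post achieves for a broadcast dim.
            const float* a_mat = a + s * N * K;
            float* o_mat = out + s * N * M;
            for (int i = 0; i < N; ++i)
                for (int j = 0; j < M; ++j) {
                    float sum = 0.f;
                    for (int l = 0; l < K; ++l)
                        sum += a_mat[i * K + l] * b[l * M + j];
                    o_mat[i * M + j] = sum;
                }
        }
    }

    int main() {
        float a[BATCH * N * K], b[K * M], out[BATCH * N * M];
        for (int i = 0; i < BATCH * N * K; ++i) a[i] = 1.f;
        for (int i = 0; i < K * M; ++i) b[i] = 2.f;
        batched_matmul(a, b, out);
        printf("out[0] = %f (expected %f)\n", out[0], 2.f * K);
        return 0;
    }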


From ace17ef9620f1ba1843f795c349be513ac47d21c Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Wed, 5 Mar 2025 14:45:09 +0000
Subject: [PATCH 11/22] Jinja matmul templates modified to accommodate the
 broadcasting modification of matmul

---
 .../configuration/matmul_config.jinja         | 21 +++++++++++++++----
 .../kernel_forward/matmul_forward.jinja       | 15 +++++++++----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja
index fece988..0c28e06 100644
--- a/aidge_export_cpp/templates/configuration/matmul_config.jinja
+++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja
@@ -1,13 +1,26 @@
 {#- For name header -#}
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
+#include "kernels/rescaling.hpp"
 
 {# For layer configuration -#}
-#define {{ name|upper }}_M {{ inputs_dims[0][0] }}
-#define {{ name|upper }}_K {{ inputs_dims[0][1] }}
-#define {{ name|upper }}_N {{ inputs_dims[1][1] }}
+{% include "./_def_io.jinja" %}
+{% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_B {{ in_dims[0][0]}}
+#define {{ name|upper }}_C {{ in_chan[0]}}
+#define {{ name|upper }}_M {{ in_height[0]}}
+#define {{ name|upper }}_K {{ in_width[0] }}
+#define {{ name|upper }}_N {{ out_width[0] }}
+
+#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
+#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
+#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
+
+int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
+int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
+
 #define {{ name|upper }}_ACTIVATION {{ activation }}
-static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
 
 {#- Calculate sizes #}
 
diff --git a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
index ce80ffd..4ed0264 100644
--- a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja
@@ -1,5 +1,12 @@
-matmul_forward<{{name|upper}}_M,
-               {{name|upper}}_K,
-               {{name|upper}}_N,
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+matmul_forward<{{name|upper}}_INPUT_A_DIMS,
+               {{name|upper}}_INPUT_B_DIMS,
+               {{name|upper}}_OUTPUT_DIMS,
+               {{name|upper}}_SIZE_DIM_IN_A,
+               {{name|upper}}_SIZE_DIM_IN_B,
+               {{name|upper}}_SIZE_DIM_OUT,
                {{name|upper}}_ACTIVATION>
-               ({{inputs1_name}}, {{inputs2_name}}, {{outputs_name}}, {{name|upper}}_RESCALING);
\ No newline at end of file
+               ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}});
+{% include "./_save_outputs.jinja" %}
+{% endfilter %} 
\ No newline at end of file
-- 
GitLab
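
For reference, a hypothetical rendering of the updated matmul_config.jinja for a node assumed to be named "matmul0" with in_dims = [[2, 3, 4], [4, 5]] and out_dims = [[2, 3, 5]]; the names and dims are illustrative only, and the _def_io.jinja / _meminfo.jinja output and the remaining size defines are omitted:

    // Hypothetical generated header (illustrative values only)
    #define MATMUL0_SIZE_DIM_IN_A 3
    #define MATMUL0_SIZE_DIM_IN_B 2
    #define MATMUL0_SIZE_DIM_OUT  3

    int MATMUL0_OUTPUT_DIMS[]  = { 2, 3, 5 };
    int MATMUL0_INPUT_A_DIMS[] = { 2, 3, 4 };
    int MATMUL0_INPUT_B_DIMS[] = { 4, 5 };

    #define MATMUL0_ACTIVATION Linear

These names are then passed as template arguments in matmul_forward.jinja, which is why the dims are emitted as global int arrays rather than single M/K/N defines.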


From 8a315efbf4986f9903d163216bdac188ee2faeec Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Tue, 25 Mar 2025 08:37:11 +0000
Subject: [PATCH 12/22] Add operator helpers for transpose, erf, matmul and
 batchnorm

---
 aidge_export_cpp/operators.py | 166 ++++++++++++++++++++++++++++------
 1 file changed, 138 insertions(+), 28 deletions(-)

diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index f04dbb3..16890e1 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -4,27 +4,14 @@ from pathlib import Path
 import aidge_core
 from aidge_core.export_utils import ExportNode, ExportNodeCpp, generate_file
 from aidge_export_cpp.utils import ROOT
+from aidge_export_cpp.utils.converter import numpy_dtype2ctype
 from aidge_export_cpp import ExportLibCpp
 
 ##############################################
 ############## Export functions ##############
 ##############################################
-def numpy_dtype2ctype(dtype):
-    if dtype == np.int8:
-        return "int8_t"
-    elif dtype == np.int16:
-        return "int16_t"
-    elif dtype == np.int32:
-        return "int32_t"
-    elif dtype == np.int64:
-        return "int64_t"
-    elif dtype == np.float32:
-        return "float"
-    elif dtype == np.float64:
-        return "double"
-    # Add more dtype mappings as needed
-    else:
-        raise ValueError(f"Unsupported {dtype} dtype")
+
+
 
 def export_params(name: str,
                   array: np.ndarray,
@@ -103,6 +90,7 @@ class ConvCPP(ExportNodeCpp):
         self.attributes["padding"] = [0, 0]
         self.attributes["activation"] = "Linear"
         self.attributes["rescaling"] = "NoScaling"
+        self.attributes["groups"] = 1
         self.config_template = str(
             ROOT / "templates" / "configuration" / "convolution_config.jinja")
         self.forward_template = str(
@@ -144,21 +132,53 @@ class PaddedConvCPP(ExportNodeCpp):
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
+        self.attributes["groups"] = 1
+
+@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class PaddedConvDepthWiseCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        # TODO find a way to retrieve attrs for meta ops
+        for n in self.operator.get_micro_graph().get_nodes():
+            if n.type() == "Pad2D":
+                self.attributes["padding"] = n.get_operator(
+                ).attr.begin_end_borders
+            if n.type() == "ConvDepthWise2D":
+                self.attributes["kernel_dims"] = n.get_operator(
+                ).attr.kernel_dims
+                self.attributes["stride_dims"] = n.get_operator(
+                ).attr.stride_dims
+                self.attributes["dilation_dims"] = n.get_operator(
+                ).attr.dilation_dims
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "convolution_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "convolution_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "convolution.hpp"),
+            str(ROOT / "kernels" / "macs.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+        self.attributes["groups"] = self.attributes["out_chan"][0]
 
 @ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class AddCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
-        self.attributes["elemwise_op"] = "Add"
+        self.attributes["add_op"] = "Add"
         self.attributes["activation"] = "Linear"
         self.attributes["rescaling"] = "NoScaling"
         self.config_template = str(
-            ROOT / "templates" / "configuration" / "elemwise_config.jinja")
+            ROOT / "templates" / "configuration" / "add_config.jinja")
         self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
+            ROOT / "templates" / "kernel_forward" / "add_forward.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "kernels" / "elemwise.hpp"),
+            str(ROOT / "kernels" / "add.hpp"),
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
@@ -167,16 +187,16 @@ class AddCPP(ExportNodeCpp):
 class SubCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
-        self.attributes["elemwise_op"] = "Sub"
+        self.attributes["sub_op"] = "Sub"
         self.attributes["activation"] = "Linear"
         self.attributes["rescaling"] = "NoScaling"
         self.config_template = str(
-            ROOT / "templates" / "configuration" / "elemwise_config.jinja")
+            ROOT / "templates" / "configuration" / "sub_config.jinja")
         self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
+            ROOT / "templates" / "kernel_forward" / "sub_forward.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "kernels" / "elemwise.hpp"),
+            str(ROOT / "kernels" / "sub.hpp"),
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
@@ -186,20 +206,39 @@ class SubCPP(ExportNodeCpp):
 class MulCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
-        self.attributes["elemwise_op"] = "Mul"
+        self.attributes["mul_op"] = "Mul"
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "mul_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "mul_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "mul.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+
+@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class DivCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["div_op"] = "Div"
         self.attributes["activation"] = "Linear"
         self.attributes["rescaling"] = "NoScaling"
         self.config_template = str(
-            ROOT / "templates" / "configuration" / "elemwise_config.jinja")
+            ROOT / "templates" / "configuration" / "div_config.jinja")
         self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
+            ROOT / "templates" / "kernel_forward" / "div_forward.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "kernels" / "elemwise.hpp"),
+            str(ROOT / "kernels" / "div.hpp"),
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
 
+
 @ExportLibCpp.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class MaxPoolCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -295,3 +334,74 @@ class FcCPP(ExportNodeCpp):
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
+
+@ExportLibCpp.register("MatMul", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class MatMulCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "matmul_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "matmul_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "matmul.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+
+@ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class ErfCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "erf_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "erf_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "erf.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+
+@ExportLibCpp.register("Transpose", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class TransposeCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        # Get parameter permutation from transpose
+        self.attributes["output_dims_order"] = self.operator.attr.get_attr("output_dims_order")
+
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "transpose_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "transpose_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "transpose.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
+
+@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class BatchNorm2DCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["activation"] = "Linear"
+        self.attributes["rescaling"] = "NoScaling"
+        self.config_template = str(
+            ROOT / "templates" / "configuration" / "batchnorm_config.jinja")
+        self.forward_template = str(
+            ROOT / "templates" / "kernel_forward" / "batchnorm_forward.jinja")
+        self.include_list = []
+        self.kernels_to_copy = [
+            str(ROOT / "kernels" / "batchnorm.hpp"),
+            str(ROOT / "kernels" / "activation.hpp"),
+            str(ROOT / "kernels" / "rescaling.hpp")
+        ]
\ No newline at end of file
-- 
GitLab


From f64c927a746af341cd70a75937ced5992cb4827a Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Thu, 27 Mar 2025 10:26:36 +0000
Subject: [PATCH 13/22] Add new 'groups' parameter to the convolution kernel

---
 aidge_export_cpp/kernels/convolution.hpp | 149 +++++++++--------------
 1 file changed, 56 insertions(+), 93 deletions(-)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index efc7ee7..b623369 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -6,114 +6,77 @@
 #include "network/utils.hpp"
 #include "kernels/macs.hpp"
 #include "kernels/activation.hpp"
+#include <omp.h>
+#include <iostream>
+#include <stdexcept>
+
+// Weights index in NHWC
+constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
+    return n * (H * W * C) +
+           h * (W * C) + 
+           w * C +
+           c;
+}
 
+// Image index in CHW
+constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
+    return c * (H * W) + 
+           h * W +
+           w;
+}
 
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+template<int NB_CHANNELS, 
+         int IN_HEIGHT, int IN_WIDTH,
+         int NB_OUTPUTS, int GROUPS,
+         int OUT_HEIGHT, int OUT_WIDTH,
          int PADDING_Y, int PADDING_X,
          int STRIDE_Y, int STRIDE_X,
          int DILATION_Y, int DILATION_X,
          int KERNEL_HEIGHT, int KERNEL_WIDTH,
          ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
+         typename Input_T, typename Output_T, 
          typename Weight_T, typename Bias_T,
          typename Rescaling_T>
-__attribute__((always_inline)) inline
+__attribute__((always_inline)) inline 
 void convolution_forward(
-    const Input_T* __restrict inputs,
+    const Input_T* __restrict inputs,  
     Output_T* __restrict outputs,
     const Weight_T* __restrict weights,
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    constexpr int DILATED_KERNEL_HEIGHT
-            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
-
-    constexpr int DILATED_KERNEL_WIDTH
-            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
-
-    constexpr int OUTPUTS_HEIGHT_NOPAD
-        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
-        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
-
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
-            : max(PADDING_Y - (oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
-                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
-                    0, DILATED_KERNEL_HEIGHT);
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
-
-#pragma omp parallel for collapse(2)
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
-                // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? DILATED_KERNEL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
-                            0, DILATED_KERNEL_WIDTH);
-                const int ix = (ox * STRIDE_X) - PADDING_X;
-
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                int oOffset = NB_OUTPUTS * oPos;
-
-                // <--
-
-                Bias_T weightedSum = biases[output];
 
-                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
-                    if ((PADDING_Y != 0
-                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
-                    {
-                        continue;
-                    }
-
-                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    int iOffset = NB_CHANNELS * iPos;
-
-                    const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
-
-                    if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH))
-                    {
-                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
-                            inputs + iOffset,
-                            weights + wOffset,
-                            weightedSum);
-                    }
-                    else {
-                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
-                            if ((PADDING_X != 0
-                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
-                            {
-                                continue;
-                            }
-
-                            int iOffsetInRange = iOffset
-                                + sx * DILATION_X * NB_CHANNELS;
-
-                            macsOnRange<NB_CHANNELS>(
-                                // same input line so no wrapping can occur
-                                inputs + iOffsetInRange,
-                                weights + wOffset + sx * NB_CHANNELS,
-                                weightedSum);
-                        }
-                    }
-                }
-
-                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
-            }
-        }
+    if (NB_CHANNELS % GROUPS != 0 || NB_OUTPUTS % GROUPS != 0) {
+        throw std::invalid_argument("Groups must be a divisor of both NB_CHANNELS and NB_OUTPUTS!");
+    }
+    
+    int c_in_g = NB_CHANNELS / GROUPS;
+    int c_out_g = NB_OUTPUTS / GROUPS;
+    #pragma omp parallel for
+    for (int oc = 0; oc < NB_OUTPUTS; oc++) {
+    	int g_oc = oc / c_out_g;
+        #pragma omp parallel for
+    	for (int i = 0; i < OUT_HEIGHT; ++i) {
+            #pragma omp parallel for
+    	    for (int j = 0; j < OUT_WIDTH; ++j) {
+    	        Output_T value = biases[oc];
+                #pragma omp parallel for
+    	        for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
+                    #pragma omp parallel for
+    	            for (int m = 0; m < KERNEL_HEIGHT; ++m) {
+                        #pragma omp parallel for
+    	                for (int n = 0; n < KERNEL_WIDTH; ++n) {
+    	                    int i_p = i * STRIDE_Y - PADDING_Y + m * DILATION_Y;
+                            int j_p = j * STRIDE_X - PADDING_X + n * DILATION_X;
+                            if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
+                                value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
+                                         inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
+    	                    }
+    	                }
+    	            }
+    	        }
+    	        outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
+    	    }
+    	} 
     }
 }
-
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
\ No newline at end of file
-- 
GitLab
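
The GROUPS template parameter added above splits the channels into GROUPS independent groups: output channel oc belongs to group oc / (NB_OUTPUTS / GROUPS) and only reads the input channels of that group, while the weight tensor is indexed by the in-group channel position (ic % c_in_g). A minimal sketch of that bookkeeping with illustrative channel counts (GROUPS == NB_CHANNELS == NB_OUTPUTS would give a depthwise convolution, which is what PaddedConvDepthWise2D relies on):

    #include <cstdio>

    int main() {
        const int NB_CHANNELS = 8, NB_OUTPUTS = 8, GROUPS = 4;
        const int c_in_g  = NB_CHANNELS / GROUPS;   // input channels per group
        const int c_out_g = NB_OUTPUTS  / GROUPS;   // output channels per group

        for (int oc = 0; oc < NB_OUTPUTS; ++oc) {
            const int g_oc = oc / c_out_g;          // group of this output channel
            // This output channel only reads input channels of its own group.
            printf("oc=%d group=%d reads ic in [%d, %d)\n",
                   oc, g_oc, g_oc * c_in_g, (g_oc + 1) * c_in_g);
        }
        return 0;
    }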


From 05c867bef62d9508b7ef1eae0c33edec1718b887 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Fri, 28 Mar 2025 13:12:16 +0000
Subject: [PATCH 14/22] Remove debug prints from elemwise kernel

---
 aidge_export_cpp/kernels/elemwise.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp
index 6f73bc0..1b45c59 100644
--- a/aidge_export_cpp/kernels/elemwise.hpp
+++ b/aidge_export_cpp/kernels/elemwise.hpp
@@ -26,7 +26,6 @@ void elemwise_forward (
         
         switch (ELEM_OP) {
             case Add: {
-                // std::cout <<"Add " << std::endl;
                 int ndim_a[SIZE_DIM_OUT];
                 int ndim_b[SIZE_DIM_OUT];
                 for (int i= 0; i<SIZE_DIM_OUT; i++){
@@ -182,7 +181,6 @@ void elemwise_forward (
                 break;
             }
             case Mul: {
-               //  std::cout<< "MUL " << std::endl;
                 int ndim_a[SIZE_DIM_OUT];
                 int ndim_b[SIZE_DIM_OUT];
 
@@ -258,8 +256,6 @@ void elemwise_forward (
                 break;
             }
             case Div: {
-
-                std::cout<< "DIV " << std::endl;
                 int ndim_a[SIZE_DIM_OUT];
                 int ndim_b[SIZE_DIM_OUT];
                 for (int i= 0; i<SIZE_DIM_OUT; i++){
-- 
GitLab


From 81d728acbb0b50c07b5c86dff61c87d250b97113 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Fri, 28 Mar 2025 13:48:54 +0000
Subject: [PATCH 15/22] New transpose method used for exporting the ConvNeXt ONNX model

---
 aidge_export_cpp/kernels/transpose_diff.hpp   | 57 +++++++++++++++++++
 .../configuration/transpose_config.jinja      |  4 +-
 .../kernel_forward/transpose_forward.jinja    |  3 +-
 3 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 aidge_export_cpp/kernels/transpose_diff.hpp

diff --git a/aidge_export_cpp/kernels/transpose_diff.hpp b/aidge_export_cpp/kernels/transpose_diff.hpp
new file mode 100644
index 0000000..298e5d9
--- /dev/null
+++ b/aidge_export_cpp/kernels/transpose_diff.hpp
@@ -0,0 +1,57 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
+
+#include "network/typedefs.hpp"
+#include <cstring>
+#include <cstdio>
+#include <iostream>
+
+using namespace std; 
+
+template< int INPUT_DIMS[],  int PERM[], int OUTPUT_DIMS[], 
+		int SIZE_OUTPUT_DIMS, int SIZE,
+        typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline 
+void transpose_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+    {
+
+	int newStrides[SIZE_OUTPUT_DIMS];
+	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){newStrides[i] = 1;}
+	for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) {
+		for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) {
+			newStrides[i] *= OUTPUT_DIMS[j];
+		}
+	}
+
+	int indices[SIZE_OUTPUT_DIMS];
+	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
+//	#pragma omp parallel for
+	for (int i = 0; i < SIZE; ++i) {
+		int idx = 0;
+//		#pragma omp parallel for
+		for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
+			idx += indices[PERM[j]] * newStrides[j];
+		}
+
+		outputs[idx] = inputs[i];
+
+
+		for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
+			if (indices[j] < INPUT_DIMS[j] - 1) {
+				indices[j]++;
+				break;
+			}
+			else {
+				indices[j] = 0;
+			}
+		}
+	}
+
+    
+}
+
+
+
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
diff --git a/aidge_export_cpp/templates/configuration/transpose_config.jinja b/aidge_export_cpp/templates/configuration/transpose_config.jinja
index c3eabc5..01e57b8 100644
--- a/aidge_export_cpp/templates/configuration/transpose_config.jinja
+++ b/aidge_export_cpp/templates/configuration/transpose_config.jinja
@@ -12,6 +12,4 @@ int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
 int {{name|upper}}_INPUT_DIMS[] = { {{ in_dims[0]|join(", ") }} };
 int {{name|upper}}_PERM[] = { {{ output_dims_order|join(", ") }} };
 
-
-
-#endif /* {{ name|upper }}_LAYER_H */
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
index 2f8d939..2a5433c 100644
--- a/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/transpose_forward.jinja
@@ -7,5 +7,4 @@ transpose_forward<{{ name|upper }}_INPUT_DIMS,
                     {{name|upper}}_SIZE>
                    ({{in_name[0]}}, {{out_name[0]}});
 {% include "./_save_outputs.jinja" %}
-{% endfilter %}
-
+{% endfilter %}
\ No newline at end of file
-- 
GitLab
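
The new transpose kernel visits the input linearly, keeps the current input coordinate in an odometer-style counter, and rebuilds each output offset from the permuted coordinates and the output's row-major strides. A minimal, self-contained sketch of that index walk, using an assumed 2x3x4 input and permutation {2, 0, 1} (illustrative values only):

    #include <cstdio>

    int main() {
        const int D = 3;
        const int in_dims[D]  = {2, 3, 4};
        const int perm[D]     = {2, 0, 1};   // output axis j takes input axis perm[j]
        const int out_dims[D] = {4, 2, 3};
        const int size = 2 * 3 * 4;

        float in[size], out[size];
        for (int i = 0; i < size; ++i) in[i] = (float)i;

        int strides[D];                      // row-major strides of the output
        for (int i = 0; i < D; ++i) {
            strides[i] = 1;
            for (int j = i + 1; j < D; ++j) strides[i] *= out_dims[j];
        }

        int indices[D] = {0, 0, 0};          // current input coordinate
        for (int i = 0; i < size; ++i) {
            int idx = 0;
            for (int j = 0; j < D; ++j) idx += indices[perm[j]] * strides[j];
            out[idx] = in[i];
            for (int j = D - 1; j >= 0; --j) {   // odometer increment
                if (++indices[j] < in_dims[j]) break;
                indices[j] = 0;
            }
        }

        // in(a=0, b=1, c=2) sits at input offset 6 and must land at
        // out(c=2, a=0, b=1), i.e. output offset 2*6 + 0*3 + 1 = 13.
        printf("out[13] = %g (expected 6)\n", out[13]);
        return 0;
    }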


From 2a06f059865e9581daf58fece9f128c06a333ee2 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Fri, 28 Mar 2025 14:07:16 +0000
Subject: [PATCH 16/22] Add new 'groups' parameter for convolution

---
 .../kernels/convolution_groups.hpp            |  84 ++++++++++
 aidge_export_cpp/operators.py                 | 150 ++++++++----------
 .../configuration/convolution_config.jinja    |   1 -
 .../kernel_forward/convolution_forward.jinja  |   2 +-
 4 files changed, 154 insertions(+), 83 deletions(-)
 create mode 100644 aidge_export_cpp/kernels/convolution_groups.hpp

diff --git a/aidge_export_cpp/kernels/convolution_groups.hpp b/aidge_export_cpp/kernels/convolution_groups.hpp
new file mode 100644
index 0000000..7d73f79
--- /dev/null
+++ b/aidge_export_cpp/kernels/convolution_groups.hpp
@@ -0,0 +1,84 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
+#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
+
+#include "network/typedefs.hpp"
+#include "kernels/rescaling.hpp"
+#include "network/utils.hpp"
+#include "kernels/macs.hpp"
+#include "kernels/activation.hpp"
+#include <omp.h>
+#include <iostream>
+
+// Weights index in NHWC
+constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
+    return n * (H * W * C) +
+           h * (W * C) + 
+           w * C +
+           c;
+}
+
+// Image index in CHW
+constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
+    return c * (H * W) + 
+           h * W +
+           w;
+}
+
+
+
+template<int NB_CHANNELS, 
+         int IN_HEIGHT, int IN_WIDTH,
+         int NB_OUTPUTS, int GROUPS,
+         int OUT_HEIGHT, int OUT_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T, 
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline 
+void convolution_forward(
+    const Input_T* __restrict inputs,  
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+
+    if (NB_CHANNELS % GROUPS != 0 || NB_OUTPUTS % GROUPS != 0) {
+        throw std::invalid_argument("Groups must be a divisor of both NB_CHANNELS and NB_OUTPUTS!");
+    }
+    
+    int c_in_g = NB_CHANNELS / GROUPS;
+    int c_out_g = NB_OUTPUTS / GROUPS;
+    #pragma omp parallel for
+    for (int oc = 0; oc < NB_OUTPUTS; oc++) {
+    	int g_oc = oc / c_out_g;
+        #pragma omp parallel for
+    	for (int i = 0; i < OUT_HEIGHT; ++i) {
+            #pragma omp parallel for
+    	    for (int j = 0; j < OUT_WIDTH; ++j) {
+    	        Output_T value = biases[oc];
+                #pragma omp parallel for
+    	        for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
+                    #pragma omp parallel for
+    	            for (int m = 0; m < KERNEL_HEIGHT; ++m) {
+                        #pragma omp parallel for
+    	                for (int n = 0; n < KERNEL_WIDTH; ++n) {
+    	                    int i_p = i * STRIDE_Y - PADDING_Y + m * DILATION_Y;
+                            int j_p = j * STRIDE_X - PADDING_X + n * DILATION_X;
+                            if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
+                                value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
+                                         inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
+    	                    }
+    	                }
+    	            }
+    	        }
+    	        outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
+    	    }
+    	} 
+    }
+}
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
\ No newline at end of file
diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index a511637..5ee9992 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -4,14 +4,27 @@ from pathlib import Path
 import aidge_core
 from aidge_core.export_utils import ExportNode, ExportNodeCpp, generate_file
 from aidge_export_cpp.utils import ROOT
-from aidge_export_cpp.utils.converter import numpy_dtype2ctype
 from aidge_export_cpp import ExportLibCpp
 
 ##############################################
 ############## Export functions ##############
 ##############################################
-
-
+def numpy_dtype2ctype(dtype):
+    if dtype == np.int8:
+        return "int8_t"
+    elif dtype == np.int16:
+        return "int16_t"
+    elif dtype == np.int32:
+        return "int32_t"
+    elif dtype == np.int64:
+        return "int64_t"
+    elif dtype == np.float32:
+        return "float"
+    elif dtype == np.float64:
+        return "double"
+    # Add more dtype mappings as needed
+    else:
+        raise ValueError(f"Unsupported {dtype} dtype")
 
 def export_params(name: str,
                   array: np.ndarray,
@@ -43,7 +56,7 @@ class ProducerCPP(ExportNode):
         super().__init__(node, mem_info)
         self.values = np.array(self.operator.get_output(0))
 
-        if len(self.values.shape) == 4:  # Note: export in HWC
+        if len(self.values.shape) == 4:  # Note: export in HWC   
             self.values =  np.transpose(self.values, (0, 2, 3, 1))
 
     def export(self, export_folder: Path):
@@ -130,6 +143,24 @@ def _setup_conv2D(conv):
         str(ROOT / "kernels" / "rescaling.hpp")
     ]
 
+
+def _setup_elemwise_op(elemwise, op):
+    """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
+
+    elemwise.attributes["elemwise_op"] = op
+    elemwise.attributes["activation"] = "Linear"
+    elemwise.attributes["rescaling"] = "NoScaling"
+    elemwise.config_template = str(
+        ROOT / "templates" / "configuration" / "elemwise_config.jinja")
+    elemwise.forward_template = str(
+        ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
+    elemwise.include_list = []
+    elemwise.kernels_to_copy = [
+        str(ROOT / "kernels" / "elemwise.hpp"),
+        str(ROOT / "kernels" / "activation.hpp"),
+        str(ROOT / "kernels" / "rescaling.hpp")
+    ]
+
 @ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class ConvCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -137,7 +168,7 @@ class ConvCPP(ExportNodeCpp):
         # No padding with Conv
         # Use PaddedConv to add padding attribute
         self.attributes["padding"] = [0, 0]
-
+        self.attributes["groups"] = 1
         _setup_conv2D(self)
 
 @ExportLibCpp.register_metaop("PaddedConv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
@@ -156,25 +187,28 @@ class PaddedConvCPP(ExportNodeCpp):
                 ).attr.stride_dims
                 self.attributes["dilation_dims"] = n.get_operator(
                 ).attr.dilation_dims
-
+        self.attributes["groups"] = 1
         _setup_conv2D(self)
 
-def _setup_elemwise_op(elemwise, op):
-    """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
+@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class PaddedConvDepthWiseCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        # TODO find a way to retrieve attrs for meta ops
+        for n in self.operator.get_micro_graph().get_nodes():
+            if n.type() == "Pad2D":
+                self.attributes["padding"] = n.get_operator(
+                ).attr.begin_end_borders
+            if n.type() == "ConvDepthWise2D":
+                self.attributes["kernel_dims"] = n.get_operator(
+                ).attr.kernel_dims
+                self.attributes["stride_dims"] = n.get_operator(
+                ).attr.stride_dims
+                self.attributes["dilation_dims"] = n.get_operator(
+                ).attr.dilation_dims
 
-    elemwise.attributes["elemwise_op"] = op
-    elemwise.attributes["activation"] = "Linear"
-    elemwise.attributes["rescaling"] = "NoScaling"
-    elemwise.config_template = str(
-        ROOT / "templates" / "configuration" / "elemwise_config.jinja")
-    elemwise.forward_template = str(
-        ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
-    elemwise.include_list = []
-    elemwise.kernels_to_copy = [
-        str(ROOT / "kernels" / "elemwise.hpp"),
-        str(ROOT / "kernels" / "activation.hpp"),
-        str(ROOT / "kernels" / "rescaling.hpp")
-    ]
+        self.attributes["groups"] = self.attributes["out_chan"][0]
+        _setup_conv2D(self)
 
 @ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class AddCPP(ExportNodeCpp):
@@ -197,6 +231,14 @@ class MulCPP(ExportNodeCpp):
 
         _setup_elemwise_op(self, "Mul")
 
+@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class MulCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        _setup_elemwise_op(self, "Div")
+
+
 def _setup_pooling(pooling):
     """Common code (template and kernel setup) shared across all the different pooling operator."""
 
@@ -211,25 +253,6 @@ def _setup_pooling(pooling):
         str(ROOT / "kernels" / "rescaling.hpp")
     ]
 
-@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class DivCPP(ExportNodeCpp):
-    def __init__(self, node, mem_info):
-        super().__init__(node, mem_info)
-        self.attributes["div_op"] = "Div"
-        self.attributes["activation"] = "Linear"
-        self.attributes["rescaling"] = "NoScaling"
-        self.config_template = str(
-            ROOT / "templates" / "configuration" / "div_config.jinja")
-        self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "div_forward.jinja")
-        self.include_list = []
-        self.kernels_to_copy = [
-            str(ROOT / "kernels" / "div.hpp"),
-            str(ROOT / "kernels" / "activation.hpp"),
-            str(ROOT / "kernels" / "rescaling.hpp")
-        ]
-
-
 @ExportLibCpp.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class MaxPoolCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -297,23 +320,20 @@ class FcCPP(ExportNodeCpp):
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
 
-@ExportLibCpp.register("MatMul", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class MatMulCPP(ExportNodeCpp):
+@ExportLibCpp.register("Transpose", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class TransposeCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
-        self.attributes["activation"] = "Linear"
-        self.attributes["rescaling"] = "NoScaling"
         self.config_template = str(
-            ROOT / "templates" / "configuration" / "matmul_config.jinja")
+            ROOT / "templates" / "configuration" / "transpose_ND_config.jinja")
         self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "matmul_forward.jinja")
+            ROOT / "templates" / "kernel_forward" / "transpose_ND_forward.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "kernels" / "matmul.hpp"),
-            str(ROOT / "kernels" / "activation.hpp"),
-            str(ROOT / "kernels" / "rescaling.hpp")
+            str(ROOT / "kernels" / "transpose.hpp")
         ]
 
+
 @ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class ErfCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -331,26 +351,6 @@ class ErfCPP(ExportNodeCpp):
             str(ROOT / "kernels" / "rescaling.hpp")
         ]
 
-@ExportLibCpp.register("Transpose", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class TransposeCPP(ExportNodeCpp):
-    def __init__(self, node, mem_info):
-        super().__init__(node, mem_info)
-        # Get parameter permutation from transpose
-        self.attributes["output_dims_order"] = self.operator.attr.get_attr("output_dims_order")
-
-        self.attributes["activation"] = "Linear"
-        self.attributes["rescaling"] = "NoScaling"
-        self.config_template = str(
-            ROOT / "templates" / "configuration" / "transpose_config.jinja")
-        self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "transpose_forward.jinja")
-        self.include_list = []
-        self.kernels_to_copy = [
-            str(ROOT / "kernels" / "transpose.hpp"),
-            str(ROOT / "kernels" / "activation.hpp"),
-            str(ROOT / "kernels" / "rescaling.hpp")
-        ]
-
 @ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
 class BatchNorm2DCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
@@ -366,16 +366,4 @@ class BatchNorm2DCPP(ExportNodeCpp):
             str(ROOT / "kernels" / "batchnorm.hpp"),
             str(ROOT / "kernels" / "activation.hpp"),
             str(ROOT / "kernels" / "rescaling.hpp")
-        ]
-@ExportLibCpp.register("Transpose", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
-class TransposeCPP(ExportNodeCpp):
-    def __init__(self, node, mem_info):
-        super().__init__(node, mem_info)
-        self.config_template = str(
-            ROOT / "templates" / "configuration" / "transpose_ND_config.jinja")
-        self.forward_template = str(
-            ROOT / "templates" / "kernel_forward" / "transpose_ND_forward.jinja")
-        self.include_list = []
-        self.kernels_to_copy = [
-            str(ROOT / "kernels" / "transpose.hpp")
         ]
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja
index 041e5b5..417f240 100644
--- a/aidge_export_cpp/templates/configuration/convolution_config.jinja
+++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja
@@ -23,5 +23,4 @@ static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
 #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
 #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
 
-
 #endif /* {{ name|upper }}_LAYER_H */
diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
index 1760aa3..98b9e03 100644
--- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
@@ -19,4 +19,4 @@ convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                     ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING);
 
 {% include "./_save_outputs.jinja" %}
-{% endfilter %}
+{% endfilter %}
\ No newline at end of file
-- 
GitLab
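
The inds_pos helpers carried into convolution_groups.hpp encode the two layouts the kernel works with: weights in NHWC (per-group input channel last) and the image in CHW. A small sketch, restating the helpers locally with illustrative dimensions, of the flat offsets they produce:

    #include <cstdio>

    // Weights index in NHWC (same form as the kernel's helper)
    constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
        return n * (H * W * C) + h * (W * C) + w * C + c;
    }
    // Image index in CHW
    constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
        return c * (H * W) + h * W + w;
    }

    int main() {
        // Weight element (oc=1, ic=2, kh=0, kw=1) with 2 outputs, 3 in-group
        // channels and a 3x3 kernel: 1*27 + 0*9 + 1*3 + 2 = 32
        printf("weight offset = %d\n", inds_pos(1, 2, 0, 1, 2, 3, 3, 3));
        // Image element (c=1, h=2, w=0) in a CHW tensor with C=3, H=4, W=5:
        // 1*20 + 2*5 + 0 = 30
        printf("image offset  = %d\n", inds_pos(1, 2, 0, 3, 4, 5));
        return 0;
    }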


From f06fd92b9b665e781d1fb0c9370d880a6751dce6 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Fri, 28 Mar 2025 14:09:16 +0000
Subject: [PATCH 17/22] Add Div to the ElemWise_T enum

---
 aidge_export_cpp/static/include/network/typedefs.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/aidge_export_cpp/static/include/network/typedefs.hpp b/aidge_export_cpp/static/include/network/typedefs.hpp
index acece91..9b83602 100644
--- a/aidge_export_cpp/static/include/network/typedefs.hpp
+++ b/aidge_export_cpp/static/include/network/typedefs.hpp
@@ -19,7 +19,8 @@ typedef enum {
 typedef enum {
     Add,
     Sub,
-    Mul
+    Mul,
+    Div
 } ElemWise_T;
 
 typedef enum {
-- 
GitLab
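
With Div added to ElemWise_T, the shared elementwise kernel can dispatch on it in its switch. A simplified sketch of that dispatch (the enum is restated locally here; the real elemwise_forward also handles broadcasting, activation and rescaling):

    #include <cstdio>

    // Local stand-in for the enum from network/typedefs.hpp
    typedef enum { Add, Sub, Mul, Div } ElemWise_T;

    template<int NB_ELTS, ElemWise_T ELEM_OP, typename T>
    void elemwise_sketch(const T* a, const T* b, T* out) {
        for (int i = 0; i < NB_ELTS; ++i) {
            switch (ELEM_OP) {
                case Add: out[i] = a[i] + b[i]; break;
                case Sub: out[i] = a[i] - b[i]; break;
                case Mul: out[i] = a[i] * b[i]; break;
                case Div: out[i] = a[i] / b[i]; break;
            }
        }
    }

    int main() {
        float a[4] = {2.f, 4.f, 6.f, 8.f}, b[4] = {2.f, 2.f, 2.f, 2.f}, out[4];
        elemwise_sketch<4, Div>(a, b, out);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
        return 0;
    }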


From 9fc76615f2efcb283fd5d2c3ceb1a2170b5ab665 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Tue, 1 Apr 2025 07:44:24 +0000
Subject: [PATCH 18/22] Delete debug includes

---
 aidge_export_cpp/kernels/add.hpp                | 3 ---
 aidge_export_cpp/kernels/batchnorm.hpp          | 1 -
 aidge_export_cpp/kernels/convolution_groups.hpp | 1 -
 aidge_export_cpp/kernels/erf.hpp                | 7 +++----
 aidge_export_cpp/kernels/matmul.hpp             | 2 +-
 5 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/aidge_export_cpp/kernels/add.hpp b/aidge_export_cpp/kernels/add.hpp
index 03ba2c5..eb8a93b 100644
--- a/aidge_export_cpp/kernels/add.hpp
+++ b/aidge_export_cpp/kernels/add.hpp
@@ -3,9 +3,6 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
-#include <cassert>
-
 
 
 template<int NB_ELTS, 
diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp
index 0ed5080..201ef16 100644
--- a/aidge_export_cpp/kernels/batchnorm.hpp
+++ b/aidge_export_cpp/kernels/batchnorm.hpp
@@ -5,7 +5,6 @@
 #include "kernels/rescaling.hpp"
 #include "kernels/activation.hpp"
 #include <math.h>
-#include <iostream>
 
 // WARNING: this kernel only works for 32-bits floating point values
 
diff --git a/aidge_export_cpp/kernels/convolution_groups.hpp b/aidge_export_cpp/kernels/convolution_groups.hpp
index 7d73f79..321ffc7 100644
--- a/aidge_export_cpp/kernels/convolution_groups.hpp
+++ b/aidge_export_cpp/kernels/convolution_groups.hpp
@@ -7,7 +7,6 @@
 #include "kernels/macs.hpp"
 #include "kernels/activation.hpp"
 #include <omp.h>
-#include <iostream>
 
 // Weights index in NHWC
 constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
index b509133..768f3b9 100644
--- a/aidge_export_cpp/kernels/erf.hpp
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -1,10 +1,9 @@
-#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERP__
-#define __AIDGE_EXPORT_CPP_KERNELS_ERP__
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
+#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
 
 #include "network/typedefs.hpp"
 #include <cmath>
 #include <math.h>
-#include <iostream>
 
 template<int _NB_ELTS,
          typename Input_T, typename Output_T>
@@ -37,4 +36,4 @@ void erf_forward (
 }
 
 
-#endif  // __AIDGE_EXPORT_CPP_KERNELS_ERP_
\ No newline at end of file
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_ERF__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp
index b284214..1403a01 100644
--- a/aidge_export_cpp/kernels/matmul.hpp
+++ b/aidge_export_cpp/kernels/matmul.hpp
@@ -3,7 +3,7 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
+
 // Generic function for matmul and activation
 
 template<int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
-- 
GitLab


From 4a4517821e3d9c70f6844f74b581e0c34304920a Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Tue, 1 Apr 2025 07:52:41 +0000
Subject: [PATCH 19/22] Resolve leftover merge conflict in matmul template and
 clean up template files

---
 .../templates/configuration/batchnorm_config.jinja          | 2 +-
 .../templates/configuration/matmul_config.jinja             | 6 ------
 .../templates/kernel_forward/batchnorm_forward.jinja        | 2 +-
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
index 701ba7c..3706ee6 100644
--- a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
+++ b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja
@@ -8,4 +8,4 @@
 #define {{ name|upper }}_ACTIVATION {{ activation }}
 #define {{ name|upper }}_EPSILON {{ epsilon }}
 
-#endif /* {{ name|upper }}_LAYER_H */
+#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja
index 4e380e5..6ef27fe 100644
--- a/aidge_export_cpp/templates/configuration/matmul_config.jinja
+++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja
@@ -7,7 +7,6 @@
 {% include "./_meminfo.jinja" %}
 
 {# For layer configuration -#}
-<<<<<<< HEAD
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
 #define {{ name|upper }}_B {{ in_dims[0][0]}}
@@ -24,11 +23,6 @@ int {{name|upper}}_OUTPUT_DIMS[] =  { {{ out_dims[0]|join(", ") }} };
 int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
 int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
 
-=======
-#define {{ name|upper }}_M {{ in_dims[0][0] }}
-#define {{ name|upper }}_K {{ in_dims[0][1] }}
-#define {{ name|upper }}_N {{ in_dims[1][1] }}
->>>>>>> origin/dev
 #define {{ name|upper }}_ACTIVATION {{ activation }}
 
 {#- Calculate sizes #}
diff --git a/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja b/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja
index 5a759b8..a18e3a7 100644
--- a/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja
@@ -6,4 +6,4 @@ batchnorm_forward<{{ out_name[0]|upper }}_NB_OUTPUTS,
                   {{name|upper}}_ACTIVATION>
                   ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{in_name[3]}}, {{in_name[4]}}, {{name|upper}}_EPSILON);
 {% include "./_save_outputs.jinja" %}
-{% endfilter %}
+{% endfilter %}
\ No newline at end of file
-- 
GitLab


From b40da6dbf9c19b2bffb8aafb8e533dc59ffc1118 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Tue, 1 Apr 2025 07:53:16 +0000
Subject: [PATCH 20/22] Corrected typo

---
 aidge_export_cpp/operators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index 5ee9992..18089fd 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -232,7 +232,7 @@ class MulCPP(ExportNodeCpp):
         _setup_elemwise_op(self, "Mul")
 
 @ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class MulCPP(ExportNodeCpp):
+class DivCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
-- 
GitLab


From dc0225cb2a571a9b995bcc31c151c3820f889f05 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Wed, 2 Apr 2025 14:25:23 +0000
Subject: [PATCH 21/22] Delete debug includes

---
 aidge_export_cpp/kernels/div.hpp            | 4 ----
 aidge_export_cpp/kernels/elemwise.hpp       | 3 ---
 aidge_export_cpp/kernels/mul.hpp            | 4 ----
 aidge_export_cpp/kernels/pooling.hpp        | 2 --
 aidge_export_cpp/kernels/sub.hpp            | 3 ---
 aidge_export_cpp/kernels/transpose_diff.hpp | 7 ++-----
 6 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/aidge_export_cpp/kernels/div.hpp b/aidge_export_cpp/kernels/div.hpp
index f1ff7d0..44640aa 100644
--- a/aidge_export_cpp/kernels/div.hpp
+++ b/aidge_export_cpp/kernels/div.hpp
@@ -3,10 +3,6 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
-#include <cassert>
-
-
 
 template<int NB_ELTS, 
         int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp
index 1b45c59..9b97959 100644
--- a/aidge_export_cpp/kernels/elemwise.hpp
+++ b/aidge_export_cpp/kernels/elemwise.hpp
@@ -3,9 +3,6 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
-#include <cassert>
-
 
 
 template<int NB_ELTS, ElemWise_T ELEM_OP,
diff --git a/aidge_export_cpp/kernels/mul.hpp b/aidge_export_cpp/kernels/mul.hpp
index cbed0f6..b3ff9e1 100644
--- a/aidge_export_cpp/kernels/mul.hpp
+++ b/aidge_export_cpp/kernels/mul.hpp
@@ -3,10 +3,6 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
-#include <cassert>
-
-
 
 template<int NB_ELTS, 
         int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], 
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index 14a2473..8f6de40 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -5,8 +5,6 @@
 #include "network/utils.hpp"
 #include <limits>
 #include <stdexcept>
-#include <iostream>
-
 
 
 void reorder_NCHW_NHWC_pool(const float* input, float* output, int N, int C, int H, int W, bool direct = true) {
diff --git a/aidge_export_cpp/kernels/sub.hpp b/aidge_export_cpp/kernels/sub.hpp
index 07637cd..2576edc 100644
--- a/aidge_export_cpp/kernels/sub.hpp
+++ b/aidge_export_cpp/kernels/sub.hpp
@@ -3,9 +3,6 @@
 
 #include "network/typedefs.hpp"
 #include "kernels/activation.hpp"
-#include <iostream>
-#include <cassert>
-
 
 
 template<int NB_ELTS, 
diff --git a/aidge_export_cpp/kernels/transpose_diff.hpp b/aidge_export_cpp/kernels/transpose_diff.hpp
index 298e5d9..712d9b4 100644
--- a/aidge_export_cpp/kernels/transpose_diff.hpp
+++ b/aidge_export_cpp/kernels/transpose_diff.hpp
@@ -2,9 +2,7 @@
 #define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
 
 #include "network/typedefs.hpp"
-#include <cstring>
-#include <cstdio>
-#include <iostream>
+
 
 using namespace std; 
 
@@ -27,10 +25,9 @@ void transpose_forward (
 
 	int indices[SIZE_OUTPUT_DIMS];
 	for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
-//	#pragma omp parallel for
+	
 	for (int i = 0; i < SIZE; ++i) {
 		int idx = 0;
-//		#pragma omp parallel for
 		for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
 			idx += indices[PERM[j]] * newStrides[j];
 		}
-- 
GitLab


From f74ec2c3a86cfb5783036d9abee25c7baa9289c7 Mon Sep 17 00:00:00 2001
From: Matthew Newson <matthew.newson@cea.fr>
Date: Thu, 3 Apr 2025 12:10:41 +0000
Subject: [PATCH 22/22] Guard OpenMP pragmas with #ifdef or delete unneeded pragmas

---
 aidge_export_cpp/kernels/add.hpp                |  2 --
 aidge_export_cpp/kernels/batchnorm.hpp          |  6 ++++++
 aidge_export_cpp/kernels/convolution_groups.hpp | 14 ++++++++------
 aidge_export_cpp/kernels/erf.hpp                |  3 ++-
 aidge_export_cpp/kernels/mul.hpp                |  2 --
 5 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/aidge_export_cpp/kernels/add.hpp b/aidge_export_cpp/kernels/add.hpp
index eb8a93b..52b58f5 100644
--- a/aidge_export_cpp/kernels/add.hpp
+++ b/aidge_export_cpp/kernels/add.hpp
@@ -55,12 +55,10 @@ void add_forward (
     if (contiguousidx > 0) {
         stride_post0[contiguousidx  - 1] = 1;
         stride_post1[contiguousidx  - 1] = 1;
-        #pragma omp parallel for
         for (int i = contiguousidx -2; i != -1; --i) {
             stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
             stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
         }
-        #pragma omp parallel for
         for (int i = 0; i < contiguousidx ; ++i) {
             stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
             stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp
index 201ef16..4100e6d 100644
--- a/aidge_export_cpp/kernels/batchnorm.hpp
+++ b/aidge_export_cpp/kernels/batchnorm.hpp
@@ -25,15 +25,21 @@ void batchnorm_forward (
 {
 
     int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH;
+#ifdef _OPENMP
     #pragma omp parallel for
+#endif
     for (int ch = 0; ch < NB_OUTPUTS; ++ch) {
         int ioIndex = ch * featureMapSize;
+#ifdef _OPENMP
         #pragma omp parallel for
+#endif
         for (int i = ioIndex; i < ioIndex + featureMapSize; i++) {
             outputs[i] = biases[ch];
         }
         float var = sqrt(variances[ch] + epsilon);
+#ifdef _OPENMP
         #pragma omp parallel for
+#endif
         for (int feature = 0; feature < featureMapSize; ++feature) {
             outputs[ioIndex + feature] += (scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var);
         }
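
For reference, the per-element computation batchnorm_forward performs for one channel reduces to bias + scale * (input - mean) / sqrt(variance + epsilon). A minimal sketch with illustrative values (the real kernel loops over NB_OUTPUTS channels and an HxW feature map, with the OpenMP pragmas now guarded):

    #include <cstdio>
    #include <cmath>

    int main() {
        const float input = 3.f, mean = 1.f, variance = 4.f;
        const float scale = 0.5f, bias = 0.25f, epsilon = 1e-5f;
        const float var = std::sqrt(variance + epsilon);
        const float output = bias + scale * (input - mean) / var;
        printf("output = %f\n", output);   // ~0.75 = 0.25 + 0.5 * 2 / ~2
        return 0;
    }
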
diff --git a/aidge_export_cpp/kernels/convolution_groups.hpp b/aidge_export_cpp/kernels/convolution_groups.hpp
index 321ffc7..17cb1bf 100644
--- a/aidge_export_cpp/kernels/convolution_groups.hpp
+++ b/aidge_export_cpp/kernels/convolution_groups.hpp
@@ -6,7 +6,6 @@
 #include "network/utils.hpp"
 #include "kernels/macs.hpp"
 #include "kernels/activation.hpp"
-#include <omp.h>
 
 // Weights index in NHWC
 constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
@@ -52,19 +51,22 @@ void convolution_forward(
     
     int c_in_g = NB_CHANNELS / GROUPS;
     int c_out_g = NB_OUTPUTS / GROUPS;
-    #pragma omp parallel for
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(3)
+#endif
     for (int oc = 0; oc < NB_OUTPUTS; oc++) {
-    	int g_oc = oc / c_out_g;
-        #pragma omp parallel for
     	for (int i = 0; i < OUT_HEIGHT; ++i) {
-            #pragma omp parallel for
     	    for (int j = 0; j < OUT_WIDTH; ++j) {
+                int g_oc = oc / c_out_g;
     	        Output_T value = biases[oc];
-                #pragma omp parallel for
     	        for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
+#ifdef _OPENMP
                     #pragma omp parallel for
+#endif
     	            for (int m = 0; m < KERNEL_HEIGHT; ++m) {
+#ifdef _OPENMP
                         #pragma omp parallel for
+#endif
     	                for (int n = 0; n < KERNEL_WIDTH; ++n) {
     	                    int i_p = i * STRIDE_Y - PADDING_Y + m * DILATION_Y;
                             int j_p = j * STRIDE_X - PADDING_X + n * DILATION_X;
diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
index 768f3b9..88aafe2 100644
--- a/aidge_export_cpp/kernels/erf.hpp
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -19,8 +19,9 @@ void erf_forward (
     double a5 =  1.061405429;
     double p  =  0.3275911;
 
-
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif 
     for (int i = 0; i < _NB_ELTS; ++i) {
         int sign = 1;
         if (inputs[i] < 0)
diff --git a/aidge_export_cpp/kernels/mul.hpp b/aidge_export_cpp/kernels/mul.hpp
index b3ff9e1..5c1ba62 100644
--- a/aidge_export_cpp/kernels/mul.hpp
+++ b/aidge_export_cpp/kernels/mul.hpp
@@ -54,12 +54,10 @@ void mul_forward (
     if (contiguousidx > 0) {
         stride_post0[contiguousidx  - 1] = 1;
         stride_post1[contiguousidx  - 1] = 1;
-        #pragma omp parallel for
         for (int i = contiguousidx -2; i != -1; --i) {
             stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
             stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
         }
-        #pragma omp parallel for
         for (int i = 0; i < contiguousidx ; ++i) {
             stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
             stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
-- 
GitLab
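
The #ifdef _OPENMP guards added in this last patch make the remaining pragmas conditional on the translation unit actually being compiled with OpenMP (e.g. -fopenmp, which defines _OPENMP), so builds without OpenMP stay free of unknown-pragma warnings and need no omp.h. A minimal sketch of the pattern:

    #include <cstdio>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    int main() {
        const int n = 8;
        float data[n];
    #ifdef _OPENMP
        // Only emitted when OpenMP is enabled at compile time
        #pragma omp parallel for
    #endif
        for (int i = 0; i < n; ++i) {
            data[i] = 2.f * i;
        }
        printf("data[7] = %g\n", data[7]);
        return 0;
    }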