diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4a443fc403ab341d770455b203524c7f6e65f42a..59b890b815549bce6ee40e5e6651ab1b3e9bd0c9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,10 +29,4 @@ test:ubuntu_python: - DEPENDENCY_JOB="build:ubuntu_python" - !reference [.ubuntu:download:artifacts, script] # Need to install extra dependence for tests: - - python -m pip install torch torchvision -coverage:ubuntu_python: - before_script: - - !reference [.setup:coverage:ubuntu_python, before_script] - - DEPS_NAMES=("aidge_onnx" "aidge_quantization") - - DEPENDENCY_JOB="build:ubuntu_python" - - !reference [.ubuntu:download:artifacts, script] \ No newline at end of file + - python -m pip install torch torchvision \ No newline at end of file diff --git a/aidge_export_cpp/benchmark.py b/aidge_export_cpp/benchmark.py index c6e4b600255b18cc59e817090374650589fbed10..542f6711e635066769fa1cd42125f66772b56e93 100644 --- a/aidge_export_cpp/benchmark.py +++ b/aidge_export_cpp/benchmark.py @@ -15,31 +15,42 @@ def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np model.set_backend("cpu") # create input Tensor list for the GraphView - ordered_inputs: list[aidge_core.Tensor] = [] + ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data] # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor - for i in input_data: - nb_dims = len(i[1].shape) - if nb_dims == 3: - ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) - if nb_dims == 4: - ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) - else: - ordered_inputs.append(aidge_core.Tensor(i[1])) + # for i in input_data: + # nb_dims = len(i[1].shape) + # if nb_dims == 3: + # ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) + # if nb_dims == 4: + # ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) + # else: + # ordered_inputs.append(aidge_core.Tensor(i[1])) # set inputs for the export for i, inp in enumerate(model.get_ordered_inputs()): op = inp[0].get_operator() - op.set_input(i, ordered_inputs[i]) + ordered_inputs[i].set_data_format(aidge_core.dformat.nchw) + op.associate_input(i, ordered_inputs[i]) + scheduler = aidge_core.SequentialScheduler(model) + scheduler.generate_scheduling() + + for i in range(len(ordered_inputs)): + ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc) + + model.set_dataformat(aidge_core.dformat.nhwc) + model.set_backend(aidge_export_cpp.ExportLibCpp._name) + aidge_core.adapt_to_backend(model) + aidge_core.adapt_fc_params_format(model) model.forward_dims([t.dims() for t in ordered_inputs]) - scheduler = aidge_core.SequentialScheduler(model) + scheduler.reset_scheduling() scheduler.generate_scheduling() # for ordered_input in ordered_inputs: # ordered_input.set_backend("cpu") operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() - print(" ├─Generating export...", end="", flush=True) + folder_name: str = f"{operator_type.lower()}_test_export_cpp" with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): aidge_core.export_utils.scheduler_export( @@ -50,12 +61,10 @@ def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np memory_manager_args={"wrapping": False } ) aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup) - print(" ok") - print(" ├─Compiling...", 
end="", flush=True) with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): run(['make'], cwd=folder_name, stdout=f) - print(" ok") + timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) folder_path = os.path.abspath(folder_name) @@ -70,30 +79,31 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray model.set_backend("cpu") # create input Tensor list for the GraphView - ordered_inputs: list[aidge_core.Tensor] = [] - # [tmp fix] manual transpositin of data for input of export BEFORE converting to Tensor - for i in input_data: - nb_dims = len(i[1].shape) - if nb_dims == 3: - ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy())) - if nb_dims == 4: - ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy())) - else: - ordered_inputs.append(aidge_core.Tensor(i[1])) + ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data] # set inputs for the export for i, inp in enumerate(model.get_ordered_inputs()): op = inp[0].get_operator() - op.set_input(i, ordered_inputs[i]) - - model.forward_dims([t.dims() for t in ordered_inputs]) + ordered_inputs[i].set_data_format(aidge_core.dformat.nchw) + op.associate_input(i, ordered_inputs[i]) scheduler = aidge_core.SequentialScheduler(model) scheduler.generate_scheduling() + for i in range(len(ordered_inputs)): + ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc) + + model.set_dataformat(aidge_core.dformat.nhwc) + model.set_backend(aidge_export_cpp.ExportLibCpp._name) + aidge_core.adapt_to_backend(model) + aidge_core.adapt_fc_params_format(model) + model.forward_dims([t.dims() for t in ordered_inputs]) + + scheduler.reset_scheduling() + scheduler.generate_scheduling() operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type() - print(" │ Generating export...", end="", flush=True) + folder_name: str = f"{operator_type.lower()}_test_export_cpp" with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): aidge_core.export_utils.scheduler_export( @@ -104,12 +114,10 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray memory_manager_args={"wrapping": False } ) aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model) - print(" ok") - print(" │ Compiling...", end="", flush=True) with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): run(['make'], cwd=folder_name, stdout=f) - print(" ok") + output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True) folder_path = os.path.abspath(folder_name) if os.path.exists(folder_path): @@ -117,22 +125,12 @@ def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray outputs_str: list[str] = output_str.stdout.strip().split('\n') outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)] - - for i, pair in enumerate(model.get_ordered_outputs()): - dims = pair[0].get_operator().get_output(pair[1]).dims() - nb_dims = len(dims) - dims_permutted = dims - if nb_dims == 3: - dims_permutted = [dims[0], dims[2], dims[1]] - if nb_dims == 4: - dims_permutted = [dims[0], dims[2], dims[3], dims[1]] - - if np.prod(dims) != outputs[i].size: - aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims) - outputs[i] = outputs[i].reshape(dims_permutted) - if nb_dims == 3: - outputs[i] = 
outputs[i].transpose(0,2,1) - if nb_dims == 4: - outputs[i] = outputs[i].transpose(0,3,1,2) - - return outputs + output_tensors = [] + outputs_dims = [pair[0].get_operator().get_output(pair[1]).dims() for pair in model.get_ordered_outputs()] + for out_idx, arr in enumerate(outputs): + t = aidge_core.Tensor(arr.reshape(outputs_dims[out_idx])) + t.set_data_format(aidge_core.dformat.nhwc) + t.set_data_format(aidge_core.dformat.nchw) + output_tensors.append(np.array(t)) + + return output_tensors diff --git a/aidge_export_cpp/export.py b/aidge_export_cpp/export.py index 10741b0d7bd5116b4d5d833157d2bad504d9aca0..4cd93b85466d8008c8d44d0b57ee428a5f1e0c5a 100644 --- a/aidge_export_cpp/export.py +++ b/aidge_export_cpp/export.py @@ -1,33 +1,34 @@ import os import shutil -import numpy as np from pathlib import Path from typing import List, Union import aidge_core from aidge_core.mem_info import generate_optimized_memory_info -from aidge_core.export_utils import scheduler_export, generate_main_cpp, aidge2c, generate_file +from aidge_core.export_utils import scheduler_export, generate_main_cpp -from aidge_export_cpp import ExportLibCpp, ROOT -from aidge_export_cpp.export_utils import read_log_file +from aidge_export_cpp import ExportLibCpp +from aidge_export_cpp.export_utils import * def export(export_folder_name: str, - graphview: aidge_core.GraphView, + model: aidge_core.GraphView, scheduler: Union[List[aidge_core.Node], aidge_core.Scheduler], inputs_tensor: aidge_core.Tensor = None, labels: aidge_core.Tensor = None, dev_mode: bool = False, - aidge_cmp: bool = False): + aidge_cmp: bool = False, + memory_manager = generate_optimized_memory_info, + memory_manager_args = {}): """ Export an aidge_core.Scheduler to C++ code :param export_folder_name: Export folder name :type export_folder_name: str - :param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and + :param model: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and ordered input/output data within the computational graph. - :type graph_view: aidge_core.GraphView + :type model: aidge_core.GraphView :param scheduler: Scheduler instance managing the computation graph. Uses `graph_view` and `get_sequential_static_scheduling` methods :param inputs_tensor: **For future** argument to provide tensor to use in the main function, not implemented yet! 
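Note on the output handling in compute_output above: the generated run_export binary prints its results in NHWC order, and the new code recovers NCHW arrays by reshaping to the NHWC dims and switching the Tensor data format back to nchw. A minimal NumPy sketch of that round-trip for a 4D output (illustrative only; the helper name and shapes are placeholders, the actual conversion is done by aidge_core.Tensor.set_data_format):

import numpy as np

def export_output_to_nchw(flat: np.ndarray, nhwc_dims: tuple) -> np.ndarray:
    # Reshape the flat buffer printed by run_export (NHWC layout) back to NCHW
    n, h, w, c = nhwc_dims
    return flat.reshape(n, h, w, c).transpose(0, 3, 1, 2)

# Round-trip check: emit an NCHW tensor in NHWC order, then recover it
x_nchw = np.arange(24, dtype=np.float32).reshape(1, 2, 3, 4)   # N, C, H, W
flat_nhwc = x_nchw.transpose(0, 2, 3, 1).ravel()
assert np.array_equal(export_output_to_nchw(flat_nhwc, (1, 3, 4, 2)), x_nchw)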
@@ -43,52 +44,112 @@ def export(export_folder_name: str, :type dev_mode: boolean """ - export_folder_name = Path(export_folder_name) + # Graph Log (Save the state of the graph through export steps) + os.makedirs("graph_log", exist_ok=True) + model.save(f"graph_log/0_export_start") + + # Remove scaling producers from the export + exclude_unwanted_producers(model) + + # Fuse nodes into MetaOps adapted to the CPP Export + cpp_fuse_to_metaops(model) + model.save(f"graph_log/1_fused_model") + + # Reset the scheduler after graph modification + scheduler = aidge_core.SequentialScheduler(model) if scheduler is None else scheduler + scheduler.reset_scheduling() + scheduler.generate_scheduling() + + # Normalize nodes names + set_nodes_names(scheduler) + model.save(f"graph_log/2_named_model") + + # Last inference to set the inputs as well as the ifmaps (aidge_cmp) + if inputs_tensor is not None: + output_array = propagate(model, scheduler, inputs_tensor) + aidge_core.Log.notice(f"Exported sample results : {np.argmax(output_array)} ( {str(np.max(output_array))} )") + aidge_core.Log.notice(f"Label : {labels}") + elif aidge_cmp: + aidge_cmp = False + aidge_core.Log.error("aidge_cmp : No input_tensor has been provided to the export() function.\n\ + Therefore ifmaps have not been generated and aidge_cmp cannot be used.") + + # Set nodes datatypes if the model has been quantized + # TODO : Should be changed with future quantization feature + if inputs_tensor is not None: + if inputs_tensor.dtype() == aidge_core.dtype.int32: + set_nodes_datatypes(model) # Set datatype to int8 only + inputs_tensor.set_datatype(aidge_core.dtype.int8) + model.save(f"graph_log/3_set_datatypes") + + # [aidge_cmp] Export feature maps tensors as json + if aidge_cmp: + generate_aidge_ifmaps(model) + + # [aidge_cmp] Set flags on each node + if aidge_cmp: + for node in model.get_nodes(): + node.attributes().aidge_cmp = True + + # Set model's dataformat (NHWC) + ## Inputs + for in_node in model.get_ordered_inputs(): + input = in_node[0].get_operator().get_input(0) + if input is not None: + # Transpose the input + input_cpy = input.clone() + input_cpy.set_data_format(aidge_core.dformat.nchw) + input_cpy.set_data_format(aidge_core.dformat.nhwc) + in_node[0].get_operator().set_input(0, input_cpy) + ## Rest of the graph + model.set_dataformat(aidge_core.dformat.nhwc) + model.save(f"graph_log/4_set_dataformats") + + # Set model's backend + model.set_backend(ExportLibCpp._name) + + # Adapt the graph to the selected backend + aidge_core.adapt_to_backend(model) + model.save(f"graph_log/5_adapt_to_backend") + aidge_core.adapt_fc_params_format(model) + model.save(f"graph_log/6_adapt_fc_params_format") + + # At this point, the graph dimensions are supposed to be statically + # forwardable, thus allow_data_dependency can be safely set to True + dims = [] + for in_node in model.get_ordered_inputs(): + dims.append(in_node[0].get_operator().get_input(0).dims()) + model.forward_dims(dims=dims, allow_data_dependency=True) + + # Reset the scheduling as the graph may have been changed + scheduler.reset_scheduling() + scheduler.generate_scheduling() # Remove existing export + export_folder_name = Path(export_folder_name) if os.path.isdir(export_folder_name): print("Removing existing export directory...") shutil.rmtree(export_folder_name) - # Generate Model Files - """ - Perform the following tasks : - - Generate the parameters and layers config files - - Generate the forward.cpp file - - Copy all needed kernels - """ + # Save the model + 
model.save("graph_log/7_exported_model") + + # Setup stats folder + if "stats_folder" not in memory_manager_args: + memory_manager_args["stats_folder"] = f"{export_folder_name}/stats" + # Generate the export scheduler_export(scheduler, export_folder_name, ExportLibCpp, - memory_manager=generate_optimized_memory_info, - memory_manager_args={ - "stats_folder": f"{export_folder_name}/stats"}, + memory_manager=memory_manager, + memory_manager_args=memory_manager_args, dev_mode=dev_mode) + model.save(f"{export_folder_name}/graph") # Generate main file - generate_main_cpp(export_folder_name, graphview, labels=labels, inputs_tensor=inputs_tensor) + generate_main_cpp(export_folder_name, model, labels=labels, inputs_tensor=inputs_tensor) # Generate log files (aidge_cmp option) - """ - If the aidge_cmp option has been enabled, the generated log_outputs will - be copied into the generated export in order to be used as reference. - """ if aidge_cmp: - ranked_nodes = graphview.get_ranked_nodes_name("{0}[{1}#{3}]") - os.makedirs(export_folder_name / "data" / "aidge_outputs") - os.makedirs(export_folder_name / "data" / "export_outputs") - for node in graphview.get_nodes(): - if node.type() != "Producer": - file_path = 'log_outputs/' + ranked_nodes[node] + '/output_0.log' - data_t = aidge2c(node.get_operator().get_output(0).dtype()) - name = node.name() + '_output_0_aidge' - dims = node.get_operator().get_output(0).dims() - values = read_log_file(file_path) - - generate_file(export_folder_name / "data" / "aidge_outputs" / (node.name() + ".hpp"), - ROOT / "templates" / "data" / "aidge_tensor.jinja", - data_t=data_t, - name=name, - dims=dims, - values=values) + export_aidge_ifmaps(export_folder_name) diff --git a/aidge_export_cpp/export_utils.py b/aidge_export_cpp/export_utils.py index e22524fb9058dfb4c8b023d0df8fbe11e2ff791b..4782119649439b50befd91cb5a9b2c2d10a86610 100644 --- a/aidge_export_cpp/export_utils.py +++ b/aidge_export_cpp/export_utils.py @@ -1,13 +1,17 @@ import os +import json +import numpy as np from collections import OrderedDict import aidge_core -from aidge_core.export_utils import get_node_from_metaop +from aidge_core.export_utils import get_node_from_metaop, aidge2c, generate_file + +from aidge_export_cpp import ROOT def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView): """ Fuse nodes into metaops adapted for the CPP Export - TODO: These recipes should be into aidge_core + TODO: These recipes should be in aidge_core :param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and ordered input/output data within the computational graph. 
@@ -137,8 +141,9 @@ def set_nodes_datatypes(graph_view: aidge_core.GraphView): if get_node_from_metaop(node, "FC") or \ get_node_from_metaop(node, "Conv2D") or \ get_node_from_metaop(node, "ConvDepthWise2D"): - - node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input + + if node.get_operator().get_input(0) is not None: + node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights if node.get_parent(2) is not None: node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases @@ -148,26 +153,8 @@ def set_nodes_datatypes(graph_view: aidge_core.GraphView): # Set input node's datatype for n in graph_view.get_input_nodes(): - n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) - - - -def read_log_file(file_path: str): - """ Read log file - Used to read the aidge generated log files containing the intermediate - tensors of the exported model. - - :param file_path: Path to the file to read. - :type file_path: str - """ - # Check if the file exists - if not os.path.isfile(file_path): - print(f"File not found: {file_path}") - return None - - with open(file_path, 'r') as file: - content = file.read() - return content + if n.get_operator().get_input(0) is not None: + n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) @@ -226,3 +213,68 @@ def normalize(array): array = (array - array.min()) / (array.max() - array.min()) return 2 * array - 1 + + +def generate_aidge_ifmaps(model): + + json_nodes = [] + for node in model.get_nodes(): + if node.type() != "Producer": + + output = node.get_operator().get_output(0) + data = { + "name": node.name(), + "dims": output.dims(), + "dtype": aidge2c(output.dtype()), + "dformat": str(output.dformat()), + "values": np.array(output).tolist() + } + json_nodes.append(data) + + # Write the entire list to the JSON file after the loop + with open('aidge_output.json', 'w') as file: + json.dump(json_nodes, file, indent=2, separators=(",", ": ")) + + + +def export_aidge_ifmaps(export_folder_name): + os.makedirs(export_folder_name / "data" / "aidge_outputs") + os.makedirs(export_folder_name / "data" / "export_outputs") + + # Load the JSON data from the file + with open('aidge_output.json', 'r') as file: + json_nodes = json.load(file) + + # Access the data + for node in json_nodes: + name = node["name"] + dims = node["dims"] + dtype = node["dtype"] + dformat = node["dformat"] + values = node["values"] + + generate_file(export_folder_name / "data" / "aidge_outputs" / (name + ".hpp"), + ROOT / "templates" / "data" / "aidge_tensor.jinja", + dtype=dtype, + dformat=dformat, + name=name + "_output_0_aidge", + dims=dims, + values=values) + + # Remove the JSON file + os.remove('aidge_output.json') + + + +def propagate(model, scheduler, tensor): + """ + Propagate the given tensor into the model and return the + output tensor. 
+ """ + # Run the inference + scheduler.forward(True, [tensor]) + # Gather the results + output_node = model.get_ordered_outputs()[0][0] + output_tensor = output_node.get_operator().get_output(0).clone() + output_tensor.set_backend("cpu") + return np.array(output_tensor) diff --git a/aidge_export_cpp/kernels/activation.hpp b/aidge_export_cpp/kernels/activation.hpp index ee80ed275ab9edf574dee6e7d32276f00ba92412..616b09581f55b2312780e4391592dba1214566a7 100644 --- a/aidge_export_cpp/kernels/activation.hpp +++ b/aidge_export_cpp/kernels/activation.hpp @@ -3,9 +3,22 @@ #include "network/activation_utils.hpp" #include "network/rescaling_utils.hpp" +#include <sys/types.h> -template<int NB_DATA, +template<size_t NB_ELTS, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Rescaling_T> __attribute__((always_inline)) inline void activation_forward ( @@ -13,12 +26,22 @@ void activation_forward ( Output_T* __restrict outputs, const Rescaling_T& __restrict rescaling) { - for (int i = 0; i < NB_DATA; ++i) - { - outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling); - } + int inOffset = 0; + int outOffset = 0; -} + for (size_t i = 0; i < NB_ELTS; ++i) { + if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { + inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) { + outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET + - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); + } + outputs[outOffset + i] = activation_forward_value<Output_T>(inputs[inOffset + i], i, ACTIVATION, rescaling); + } +} #endif // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__ diff --git a/aidge_export_cpp/kernels/batchnorm.hpp b/aidge_export_cpp/kernels/batchnorm.hpp index 27866ab923eb8a519e684030cfb63f894c15ec98..b2a3452c9c12d0e64cc656718d6653409647105c 100644 --- a/aidge_export_cpp/kernels/batchnorm.hpp +++ b/aidge_export_cpp/kernels/batchnorm.hpp @@ -3,13 +3,13 @@ #include "network/typedefs.hpp" #include "network/activation_utils.hpp" - +#include <sys/types.h> #include <math.h> // WARNING: this kernel only works for 32-bits floating point values -template<int NB_BATCHES, int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, +template<size_t NB_BATCHES, size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, ActivationFunction_T ACTIVATION, typename Input_T, typename Output_T, typename Param_T, @@ -25,14 +25,14 @@ void batchnorm_forward ( const double epsilon, const Rescaling_T& __restrict rescaling) { - for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) { - for (unsigned int output = 0; output < NB_OUTPUTS; ++output) { + for (size_t batch = 0; batch < NB_BATCHES; ++batch) { + for (size_t output = 0; output < NB_OUTPUTS; ++output) { // If the variance is 0, we need to avoid division by 0 Output_T var = sqrt(variances[output] > 0.0 ? 
variances[output] + epsilon : epsilon); - for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { - for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - const int outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox; + for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + const size_t outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox; const Output_T normalized = (inputs[outputOffset] - means[output]) / var; const Output_T sAs = scales[output] * normalized + biases[output]; diff --git a/aidge_export_cpp/kernels/concat.hpp b/aidge_export_cpp/kernels/concat.hpp index dde8c4fc3a9ce9eea5d4ae4cfad35c078f60450d..effaeba35bbd2997df0116e7a04dc53e5f7f3798 100644 --- a/aidge_export_cpp/kernels/concat.hpp +++ b/aidge_export_cpp/kernels/concat.hpp @@ -1,39 +1,37 @@ #ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__ #define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__ -template<int AXIS_SIZE_POST, - int AXIS_SIZE_PRE, - unsigned int NB_INPUTS, +#include <sys/types.h> + +template<size_t AXIS_SIZE_POST, + size_t AXIS_SIZE_PRE, + const size_t AXIS_SIZE[], + size_t TOTAL_AXIS_SIZE, + size_t NB_INPUTS, typename T> __attribute__((always_inline)) inline static void concat_forward ( const T* const * __restrict inputs, - const unsigned int* __restrict sizes, T* __restrict output) { - unsigned int total_concat_axis_size = 0; - for (unsigned int n = 0; n < NB_INPUTS; ++n) - total_concat_axis_size += sizes[n]; - - for (int i = 0; i < AXIS_SIZE_PRE; ++i) { + for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) { // Loop over post-axis (e.g., dims after axis 1) - for (int j = 0; j < AXIS_SIZE_POST; ++j) { - unsigned int axis_offset = 0; + for (size_t j = 0; j < AXIS_SIZE_POST; ++j) { + size_t axis_offset = 0; // Loop over each input tensor - for (unsigned int n = 0; n < NB_INPUTS; ++n) { - for (unsigned int k = 0; k < sizes[n]; ++k) { - const int input_idx = i * sizes[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j; + for (size_t n = 0; n < NB_INPUTS; ++n) { + for (size_t k = 0; k < AXIS_SIZE[n]; ++k) { + const size_t input_idx = i * AXIS_SIZE[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j; - output[i * total_concat_axis_size * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] = + output[i * TOTAL_AXIS_SIZE * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] = inputs[n][input_idx]; } - axis_offset += sizes[n]; // move along axis in output + axis_offset += AXIS_SIZE[n]; // move along axis in output } } } - } #endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp index ed62401e69ff8d53b23ba9f88917bb54acd3740a..01b3f1b8e2ac3bc731f7067f6b6c3e8ee8706089 100644 --- a/aidge_export_cpp/kernels/convolution.hpp +++ b/aidge_export_cpp/kernels/convolution.hpp @@ -6,17 +6,29 @@ #include "network/utils.hpp" #include "network/macs.hpp" #include "network/activation_utils.hpp" - - -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - int PADDING_Y, int PADDING_X, - int STRIDE_Y, int STRIDE_X, - int DILATION_Y, int DILATION_X, - int KERNEL_HEIGHT, int KERNEL_WIDTH, +#include <sys/types.h> + +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + size_t PADDING_Y, size_t 
PADDING_X, + size_t STRIDE_Y, size_t STRIDE_X, + size_t DILATION_Y, size_t DILATION_X, + size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Weight_T, typename Bias_T, typename Rescaling_T> @@ -28,62 +40,82 @@ void convolution_forward( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - constexpr int DILATED_KERNEL_HEIGHT - = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); - - constexpr int DILATED_KERNEL_WIDTH - = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1); - - constexpr int OUTPUTS_HEIGHT_NOPAD + constexpr size_t OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y; - constexpr int OUTPUTS_WIDTH_NOPAD + constexpr size_t OUTPUTS_WIDTH_NOPAD = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X; - for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { - const int syMin = (PADDING_Y == 0) ? 0 - : max(PADDING_Y - (oy * STRIDE_Y), 0); - const int syMax = (PADDING_Y == 0 - && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT - : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), - 0, DILATED_KERNEL_HEIGHT); - const int iy = (oy * STRIDE_Y) - PADDING_Y; + for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const size_t syMin = (PADDING_Y == 0) ? 0 + : max((PADDING_Y - (oy * STRIDE_Y) + DILATION_Y - 1) / DILATION_Y, 0); + const size_t syMax = (PADDING_Y == 0 + && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT + : clamp((CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y)) / DILATION_Y, + 0, KERNEL_HEIGHT); + const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - for (int output = 0; output < NB_OUTPUTS; ++output) { + for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + for (size_t output = 0; output < NB_OUTPUTS; ++output) { // moved to inner loop for collapsing --> - const int sxMin = (PADDING_X == 0) ? 0 - : max(PADDING_X - (ox * STRIDE_X), 0); - const int sxMax = (PADDING_X == 0 + const size_t sxMin = (PADDING_X == 0) ? 0 + : max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0); + const size_t sxMax = (PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) - ? DILATED_KERNEL_WIDTH - : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), - 0, DILATED_KERNEL_WIDTH); - const int ix = (ox * STRIDE_X) - PADDING_X; + ? KERNEL_WIDTH + : clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X, + 0, KERNEL_WIDTH); + const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); - const int oPos = (ox + OUTPUTS_WIDTH * oy); - const int oOffset = NB_OUTPUTS * oPos; + const size_t oPos = (ox + OUTPUTS_WIDTH * oy); + int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos; + + if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) { + oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET + - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); + } // <-- // Check if the biases are defined Bias_T weightedSum = biases ? 
biases[output] : 0; - for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { + for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { if ((PADDING_Y != 0 || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) - && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax))) + && sy >= syMax - syMin) { - continue; + break; } - const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y); - const int iOffset = NB_CHANNELS * iPos; + const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix) + + CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y)); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS; + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; - if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + if (INPUT_MEM_WRAP_SIZE > 0 + && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! + && iOffset + KERNEL_WIDTH * NB_CHANNELS + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + const size_t wOffset = NB_CHANNELS * (sxMin + + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output)); + + if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T)) + && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) || sxMax - sxMin == KERNEL_WIDTH)) { macsOnRange<KERNEL_WIDTH * NB_CHANNELS>( @@ -92,16 +124,24 @@ void convolution_forward( weightedSum); } else { - for (int sx = 0; sx < KERNEL_WIDTH; ++sx) { + for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { if ((PADDING_X != 0 || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) - && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax))) + && sx >= sxMax - sxMin) { - continue; + break; } - const int iOffsetInRange = iOffset - + sx * DILATION_X * NB_CHANNELS; + int iOffsetInRange = iOffset + + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } macsOnRange<NB_CHANNELS>( // same input line so no wrapping can occur @@ -118,16 +158,28 @@ void convolution_forward( } } -// Template specialization when biases are not given to the convolution -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - int PADDING_Y, int PADDING_X, - int STRIDE_Y, int STRIDE_X, - int DILATION_Y, int DILATION_X, - int KERNEL_HEIGHT, int KERNEL_WIDTH, +// Template overloading when biases are not given to the convolution +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + size_t PADDING_Y, size_t PADDING_X, + size_t STRIDE_Y, size_t STRIDE_X, + size_t DILATION_Y, size_t DILATION_X, + size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t 
OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Weight_T, typename Rescaling_T> @@ -155,7 +207,19 @@ void convolution_forward( DILATION_X, KERNEL_HEIGHT, KERNEL_WIDTH, - ACTIVATION> + ACTIVATION, + // Memory mapping: inputs + INPUT_MEM_CONT_OFFSET, + INPUT_MEM_CONT_SIZE, + INPUT_MEM_WRAP_OFFSET, + INPUT_MEM_WRAP_SIZE, + INPUT_MEM_STRIDE, + // Memory mapping: outputs + OUTPUT_MEM_CONT_OFFSET, + OUTPUT_MEM_CONT_SIZE, + OUTPUT_MEM_WRAP_OFFSET, + OUTPUT_MEM_WRAP_SIZE, + OUTPUT_MEM_STRIDE> (inputs, outputs, weights, b, rescaling); } diff --git a/aidge_export_cpp/kernels/convolution_depthwise.hpp b/aidge_export_cpp/kernels/convolution_depthwise.hpp index 244dd86bc01be7142474380f1e3393ce32446aaf..613271cae946a82cd65d0d8dbc5db66fe42745d4 100644 --- a/aidge_export_cpp/kernels/convolution_depthwise.hpp +++ b/aidge_export_cpp/kernels/convolution_depthwise.hpp @@ -6,16 +6,29 @@ #include "network/utils.hpp" #include "network/macs.hpp" #include "network/activation_utils.hpp" - -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - int PADDING_Y, int PADDING_X, - int STRIDE_Y, int STRIDE_X, - int DILATION_Y, int DILATION_X, - int KERNEL_HEIGHT, int KERNEL_WIDTH, +#include <sys/types.h> + +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + size_t PADDING_Y, size_t PADDING_X, + size_t STRIDE_Y, size_t STRIDE_X, + size_t DILATION_Y, size_t DILATION_X, + size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Weight_T, typename Bias_T, typename Rescaling_T> @@ -30,50 +43,55 @@ void convolution_depthwise_forward( static_assert(NB_OUTPUTS % NB_CHANNELS == 0, "NB_OUTPUTS should be a multiple of NB_CHANNELS."); - constexpr int DILATED_KERNEL_HEIGHT + constexpr size_t DILATED_KERNEL_HEIGHT = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); - constexpr int DILATED_KERNEL_WIDTH + constexpr size_t DILATED_KERNEL_WIDTH = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1); - constexpr int OUTPUTS_HEIGHT_NOPAD + constexpr size_t OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y; - constexpr int OUTPUTS_WIDTH_NOPAD + constexpr size_t OUTPUTS_WIDTH_NOPAD = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X; - for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { - const int syMin = (PADDING_Y == 0) ? 0 + for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const size_t syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - (oy * STRIDE_Y), 0); - const int syMax = (PADDING_Y == 0 + const size_t syMax = (PADDING_Y == 0 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? 
DILATED_KERNEL_HEIGHT : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 0, DILATED_KERNEL_HEIGHT); - const int iy = (oy * STRIDE_Y) - PADDING_Y; + const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - for (int output = 0; output < NB_OUTPUTS; ++output) { + for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + for (size_t output = 0; output < NB_OUTPUTS; ++output) { // moved to inner loop for collapsing --> - const int sxMin = (PADDING_X == 0) ? 0 + const size_t sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - (ox * STRIDE_X), 0); - const int sxMax = (PADDING_X == 0 + const size_t sxMax = (PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) ? DILATED_KERNEL_WIDTH : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 0, DILATED_KERNEL_WIDTH); - const int ix = (ox * STRIDE_X) - PADDING_X; + const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); - const int oPos = (ox + OUTPUTS_WIDTH * oy); - const int oOffset = NB_OUTPUTS * oPos; + const size_t oPos = (ox + OUTPUTS_WIDTH * oy); + int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos; + + if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) { + oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET + - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); + } // <-- - const int channel = (output * NB_CHANNELS) / NB_OUTPUTS; + const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS; Bias_T weightedSum = biases ? biases[output] : 0; - for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { + for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) { if ((PADDING_Y != 0 || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax))) @@ -81,13 +99,33 @@ void convolution_depthwise_forward( continue; } - const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y); - const int iOffset = NB_CHANNELS * iPos; + const size_t iPos = static_cast<size_t>(ix) + + CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y)); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; - const int wOffset = (output*KERNEL_HEIGHT + sy) + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; + + if (INPUT_MEM_WRAP_SIZE > 0 + && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + KERNEL_WIDTH * NB_CHANNELS + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + const size_t wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH; - if (DILATION_X == 1 && ((PADDING_X == 0 + if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T)) + && DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) || sxMax - sxMin == KERNEL_WIDTH)) { @@ -97,7 +135,7 @@ void convolution_depthwise_forward( weightedSum); } else { - for (int sx = 0; sx < KERNEL_WIDTH; ++sx) { + for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) { if ((PADDING_X != 0 || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax))) @@ -105,8 +143,16 @@ void convolution_depthwise_forward( continue; } - const int iOffsetInRange = iOffset - + sx * DILATION_X * NB_CHANNELS; + int iOffsetInRange = iOffset + + sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } weightedSum += inputs[iOffsetInRange + channel] * weights[wOffset + sx]; @@ -130,6 +176,18 @@ template<int NB_CHANNELS, int DILATION_Y, int DILATION_X, int KERNEL_HEIGHT, int KERNEL_WIDTH, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + int INPUT_MEM_CONT_OFFSET, + int INPUT_MEM_CONT_SIZE, + int INPUT_MEM_WRAP_OFFSET, + int INPUT_MEM_WRAP_SIZE, + int INPUT_MEM_STRIDE, + // Memory mapping: outputs + int OUTPUT_MEM_CONT_OFFSET, + int OUTPUT_MEM_CONT_SIZE, + int OUTPUT_MEM_WRAP_OFFSET, + int OUTPUT_MEM_WRAP_SIZE, + int OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Weight_T, typename Rescaling_T> @@ -157,7 +215,19 @@ void convolution_depthwise_forward( DILATION_X, KERNEL_HEIGHT, KERNEL_WIDTH, - ACTIVATION> + ACTIVATION, + // Memory mapping: inputs + INPUT_MEM_CONT_OFFSET, + INPUT_MEM_CONT_SIZE, + INPUT_MEM_WRAP_OFFSET, + INPUT_MEM_WRAP_SIZE, + INPUT_MEM_STRIDE, + // Memory mapping: outputs + OUTPUT_MEM_CONT_OFFSET, + OUTPUT_MEM_CONT_SIZE, + OUTPUT_MEM_WRAP_OFFSET, + OUTPUT_MEM_WRAP_SIZE, + OUTPUT_MEM_STRIDE> (inputs, outputs, weights, b, rescaling); } diff --git a/aidge_export_cpp/kernels/elemwise.hpp b/aidge_export_cpp/kernels/elemwise.hpp index 9468b33f6b9785f36f511b14daffe9cc4a0ed420..fed13f956f7a464ba0ca7cf3e92642e6c28cd9db 100644 --- a/aidge_export_cpp/kernels/elemwise.hpp +++ b/aidge_export_cpp/kernels/elemwise.hpp @@ -3,169 +3,73 @@ #include "network/typedefs.hpp" #include "network/activation_utils.hpp" - -// Generic function for two inputs - -template<int NB_ELTS, - ElemWise_T ELEM_OP, - ActivationFunction_T ACTIVATION, - typename Input_T, typename Output_T, - typename Rescaling_T> +#include <sys/types.h> + +template<size_t NB_MAT, ElemWise_T ELEM_OP, + size_t INPUT1_CONT_SIZE, size_t INPUT2_CONT_SIZE, size_t OUTPUT_CONT_SIZE, + const size_t OFFSET_IN1[], const size_t OFFSET_IN2[], + ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT1_MEM_CONT_OFFSET, + size_t INPUT1_MEM_CONT_SIZE, + size_t INPUT1_MEM_WRAP_OFFSET, + size_t INPUT1_MEM_WRAP_SIZE, + size_t INPUT1_MEM_STRIDE, + size_t INPUT2_MEM_CONT_OFFSET, + size_t INPUT2_MEM_CONT_SIZE, + size_t INPUT2_MEM_WRAP_OFFSET, + size_t INPUT2_MEM_WRAP_SIZE, + size_t INPUT2_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + 
size_t OUTPUT_MEM_STRIDE, + typename Input_T, typename Output_T, typename Rescaling_T> __attribute__((always_inline)) inline -void elemwise_forward ( +void elemwise_forward( Output_T* __restrict outputs, const Rescaling_T& __restrict rescaling, const Input_T* __restrict inputs1, const Input_T* __restrict inputs2) { - if (std::is_floating_point<Input_T>::value) - { - Input_T val = 0; - - switch (ELEM_OP) { - case Add: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] + inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - case Sub: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] - inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - - } - break; - } - case Mul: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] * inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - default: { - // Copy inputs1 in outputs for default case - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - } - } - else - { - int32_t val = 0; + static_assert(INPUT1_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping"); + static_assert(INPUT2_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping"); + static_assert(OUTPUT_MEM_CONT_SIZE % OUTPUT_CONT_SIZE == 0, "Incompatible output memory wrapping"); + auto apply_op = [](auto a, auto b) -> Output_T { switch (ELEM_OP) { - case Add: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] + inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - case Sub: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] - inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - case Mul: { - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i] * inputs2[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } - default: { - // Copy inputs1 in outputs for default case - for (int i = 0; i < NB_ELTS; ++i) { - val = inputs1[i]; - outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling); - } - break; - } + case Add: return a + b; + case Sub: return a - b; + case Mul: return a * b; + case Div: return a / b; + default: return a; } - } -} - + }; -// Generic function for multiple inputs -// Not working - -// template<ElemWise_T ELEM_OP, typename Output_T> -// __attribute__((always_inline)) inline -// Output_T elemWise (int /*pos*/, int /*ch*/) -// { -// return 0; -// } - -// template<ElemWise_T ELEM_OP, -// int NB_CHANNELS, -// // For next inputs -// int... ARGS, -// typename... INPUTS, -// // Types -// typename Input_T, typename Output_T> -// __attribute__((always_inline)) inline -// Output_T elemWise (int pos, int ch, -// const Input_T* __restrict firstInputs, -// INPUTS... inputs) -// { -// int iOffset = NB_CHANNELS * pos; - -// return firstInputs[iOffset + ch] -// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...); -// } - -// template<// For all inputs -// int NB_CHANNELS, -// int CHANNELS_HEIGHT, int CHANNELS_WIDTH, -// int NB_ELTS, -// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, -// ElemWise_T ELEM_OP, -// ActivationFunction_T ACTIVATION, -// // For next inputs -// int... ARGS, -// typename... 
INPUTS,
-// // Types
-// typename Input_T, typename Output_T,
-// typename Rescaling_T>
-// __attribute__((always_inline)) inline
-// void elemWise_forward (
-// Output_T* __restrict outputs,
-// const Rescaling_T& __restrict rescaling,
-// const Input_T* __restrict firstInputs,
-// INPUTS... inputs)
-// {
-// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
-// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
-// const int pos = (ox + OUTPUTS_WIDTH * oy);
-// int oOffset = NB_ELTS * pos;
-// for (int ch = 0; ch < NB_ELTS; ++ch) {
-// const Add_T val = elemWise<ELEM_OP,
-// INPUT_NB_CHANNELS,
-// INPUT_MEM_CONT_OFFSET,
-// INPUT_MEM_CONT_NB_ELTS,
-// INPUT_MEM_WRAP_OFFSET,
-// INPUT_MEM_WRAP_NB_ELTS,
-// INPUT_MEM_STRIDE,
-// ARGS...>(pos, ch, firstInputs, inputs...);
-// outputs[oOffset + ch]
-// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
-// }
-// }
-// }
-// }
+ for (size_t stack = 0; stack < NB_MAT; ++stack) {
+ const size_t offset_in1 = OFFSET_IN1[stack] * INPUT1_CONT_SIZE;
+ const size_t offset_in2 = OFFSET_IN2[stack] * INPUT2_CONT_SIZE;
+ int out_offset = stack * OUTPUT_CONT_SIZE;
+ if (OUTPUT_MEM_WRAP_SIZE > 0 && out_offset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+ out_offset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+ - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+ }
+ for (size_t i = 0; i < OUTPUT_CONT_SIZE; ++i) {
+ const size_t in0_id = (INPUT1_CONT_SIZE != 1) ? i : 0;
+ const size_t in1_id = (INPUT2_CONT_SIZE != 1) ? i : 0;
+ const size_t out_id = out_offset + i;
+ const auto val1 = inputs1[in0_id + offset_in1];
+ const auto val2 = inputs2[in1_id + offset_in2];
+ const Output_T val = apply_op(val1, val2);
+ outputs[out_id] = activation_forward_value<Output_T>(val, out_id, ACTIVATION, rescaling);
+ }
+ }
+}
 #endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
diff --git a/aidge_export_cpp/kernels/erf.hpp b/aidge_export_cpp/kernels/erf.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..371b959b011fe2f80fe8ac5e8f9284433885d95d
--- /dev/null
+++ b/aidge_export_cpp/kernels/erf.hpp
@@ -0,0 +1,40 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
+#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
+
+#include "network/typedefs.hpp"
+#include <math.h>
+#include <sys/types.h>
+
+template<size_t NB_ELTS,
+ typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void erf_forward (
+ const Input_T* __restrict inputs,
+ Output_T* __restrict outputs)
+{
+ constexpr double a1 = 0.254829592;
+ constexpr double a2 = -0.284496736;
+ constexpr double a3 = 1.421413741;
+ constexpr double a4 = -1.453152027;
+ constexpr double a5 = 1.061405429;
+ constexpr double p = 0.3275911;
+
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (size_t i = 0; i < NB_ELTS; ++i) {
+ int sign = 1;
+ if (inputs[i] < 0)
+ sign = -1;
+ const double abs_value = fabs(inputs[i]);
+
+ // A&S formula 7.1.26
+ const double t = 1.0/(1.0 + p*abs_value);
+ const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
+ outputs[i] = sign*y;
+
+ }
+}
+
+
+#endif // __AIDGE_EXPORT_CPP_KERNELS_ERF__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index abaab59c355263a79c905ffeb8a2a72b6e976445..5d9aaf5a91d4a2ba6176df6da59b4bd7a7122d1e 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -6,12 +6,29 @@
 #include "network/utils.hpp"
 #include "network/macs.hpp"
 #include "network/activation_utils.hpp"
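As a quick numerical cross-check of the Abramowitz and Stegun formula 7.1.26 polynomial used by erf.hpp above (same constants, evaluated in Python against math.erf; the 1e-6 tolerance sits above the approximation's documented maximum error of roughly 1.5e-7):

import math

A1, A2, A3, A4, A5, P = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429, 0.3275911

def erf_approx(x: float) -> float:
    sign = -1.0 if x < 0 else 1.0
    x = abs(x)
    t = 1.0 / (1.0 + P * x)
    y = 1.0 - (((((A5 * t + A4) * t) + A3) * t + A2) * t + A1) * t * math.exp(-x * x)
    return sign * y

for v in (-2.0, -0.5, 0.0, 0.3, 1.0, 3.0):
    assert abs(erf_approx(v) - math.erf(v)) < 1e-6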
+#include <sys/types.h> -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, +/** + * @brief Kernel to use when the input is in the NHWC format, and the + * weights have been transposed accordingly. + */ +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T, typename Weight_T, typename Bias_T, typename Rescaling_T> @@ -23,55 +40,209 @@ void fullyconnected_forward ( const Bias_T* __restrict biases, const Rescaling_T& __restrict rescaling) { - // Warning, there is a trick here ! - // To use this kernel, the inputs have to be in NHWC and the weights are in NCHW - // It is only an issue if the FC was after a flatten layer. - // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1 - // Solution: Add a system to check dataformat + constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T)); + constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH; + // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT; + + constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS; + constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH; + constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT; #ifdef _OPENMP #pragma omp parallel for #endif - for (int och = 0; och < NB_OUTPUTS; och++) { - + for (size_t och = 0; och < NB_OUTPUTS; ++och) { Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); - for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) { - for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) { - for (int ch = 0; ch < NB_CHANNELS; ++ch) { - weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch] - * weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix]; + for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { + int iOffset = INPUT_HEIGHT_STRIDE * iy; + + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; + + if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + CHANNELS_WIDTH * NB_CHANNELS + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och; + + if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) { + macsOnRange<INPUT_HEIGHT_STRIDE>( + inputs + iOffset, + weights + wOffset, + weightedSum); + } + else { + for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { + int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE; + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + macsOnRange<INPUT_WIDTH_STRIDE>( + inputs + iOffsetInRange, + weights + wOffset + ix * WEIGHT_WIDTH_STRIDE, + weightedSum); } } } outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); } -/* -Here the kernel to use with inputs in NHWC and weights in NHWC +} + +/** + * @brief Kernel to use when the input is in the NCHW or Default format + * format (4D or 2D). + */ +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, + typename Input_T, typename Output_T, + typename Weight_T, typename Bias_T, + typename Rescaling_T> +__attribute__((always_inline)) inline +void fullyconnected_default_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Weight_T* __restrict weights, + const Bias_T* __restrict biases, + const Rescaling_T& __restrict rescaling) +{ + constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT; + #ifdef _OPENMP #pragma omp parallel for #endif - for (int och = 0; och < NB_OUTPUTS; och++) { + for (size_t och = 0; och < NB_OUTPUTS; och++) { + Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); + + const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och; + + macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>( + inputs, + weights + wOffset, + weightedSum); + outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); + } +} + +/** + * @brief Kernel to use when the input is in the NHWC format, but the + * weights have not been transposed and still follow the NCHW format order. 
+ */ +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, + typename Input_T, typename Output_T, + typename Weight_T, typename Bias_T, + typename Rescaling_T> +__attribute__((always_inline)) inline +void fullyconnected_transpose_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Weight_T* __restrict weights, + const Bias_T* __restrict biases, + const Rescaling_T& __restrict rescaling) +{ + constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T)); + constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH; + // constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT; + + constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH; + constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH; + constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t och = 0; och < NB_OUTPUTS; och++) { Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0); - for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) { - const int iPos = (CHANNELS_WIDTH * iy); - int iOffset = NB_CHANNELS * iPos; + for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) { + int iOffset = INPUT_HEIGHT_STRIDE * iy; + + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; + + if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + CHANNELS_WIDTH * NB_CHANNELS + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } - const int wOffset = NB_CHANNELS * CHANNELS_WIDTH - * (iy + CHANNELS_HEIGHT * och); + const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy; - macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>( - inputs + iOffset, - weights + wOffset, - weightedSum); + for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) { + int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE; + + if (wrapInRange + && iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + + // Beware that the pointer increment for weights is + // CHANNELS_HEIGHT*CHANNELS_WIDTH + macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>( + inputs + iOffsetInRange, + weights + wOffset + ix, + weightedSum); + } } outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling); } -*/ } - #endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__ diff --git a/aidge_export_cpp/kernels/hardmax.hpp b/aidge_export_cpp/kernels/hardmax.hpp index 0f183b394e0aca1fd5502573ab627ea23d067169..0f15936d255be014003ae2550d2ea1e19849eabb 100644 --- a/aidge_export_cpp/kernels/hardmax.hpp +++ b/aidge_export_cpp/kernels/hardmax.hpp @@ -3,15 +3,16 @@ #include "network/typedefs.hpp" #include "network/utils.hpp" +#include <sys/types.h> // Todo add border value and border type (Reflect, Constant, Wrap...) and add // the two missing pad value (bottom and right) -template <unsigned int AXIS_DIM_SIZE, - unsigned int PREAXIS_STRIDE, - unsigned int AXIS_STRIDE, - unsigned int POSTAXIS_STRIDE, - unsigned int NB_ELTS, +template <size_t AXIS_DIM_SIZE, + size_t PREAXIS_STRIDE, + size_t AXIS_STRIDE, + size_t POSTAXIS_STRIDE, + size_t NB_ELTS, typename Input_T, typename Output_T> // void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const @@ -26,17 +27,17 @@ hardmax2d_forward(const Input_T *__restrict input, // For each index on all the axes before and after 'axis', we have a // different max element to find - for (unsigned int i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE; + for (size_t i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE; ++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) { - for (unsigned int j = 0; j < POSTAXIS_STRIDE; ++j) { + for (size_t j = 0; j < POSTAXIS_STRIDE; ++j) { // Init the max with first element - unsigned int maxIdx = 0; + size_t maxIdx = 0; Input_T maxVal = input[preAxisOffset + j]; // Loop over the elements on 'axis' // Since we start at 0th idx, we already initialize the values like // the 1st iteration has been done - for (unsigned int k = 1, + for (size_t k = 1, postAxisOffset = preAxisOffset + POSTAXIS_STRIDE; k < AXIS_DIM_SIZE; ++k, postAxisOffset += POSTAXIS_STRIDE) { diff --git a/aidge_export_cpp/kernels/reshape.hpp b/aidge_export_cpp/kernels/identity.hpp similarity index 56% rename from aidge_export_cpp/kernels/reshape.hpp rename to aidge_export_cpp/kernels/identity.hpp index 376ed8859241947c8239d086df75838b4865505e..7f7b31d80b2ab5eba9a41ce63d35c88cc8255c72 100644 --- a/aidge_export_cpp/kernels/reshape.hpp +++ b/aidge_export_cpp/kernels/identity.hpp @@ -1,27 +1,27 @@ -#ifndef __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__ -#define __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__ +#define __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__ #include "network/typedefs.hpp" +#include <sys/types.h> -// Generic function for reshape and 
activation
+// Generic function for identity and activation

-template<int M,
+template<size_t M,
    typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
-void reshape_forward (
+void identity_forward (
    const Input_T* __restrict inputs,
-    const Input_T* __restrict /*shape*/,
    Output_T* __restrict outputs)
{
    // If inputs and outputs pointers are the same, the memory manager has already optimized this function so it is a no-op !
    if (inputs == outputs)
        return;

-    // A reshape in c++ world should equal to a Noop
+    // An identity in the c++ world is equivalent to a no-op
    // We only need to copy the input buffer to the output
-    for (int m = 0; m < M; ++m) {
+    for (size_t m = 0; m < M; ++m) {
        outputs[m] = inputs[m];
    }
}
-#endif // __AIDGE_EXPORT_CPP_KERNELS_RESHAPE__
\ No newline at end of file
+#endif // __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/leakyrelu.hpp b/aidge_export_cpp/kernels/leakyrelu.hpp
index 5e6598d8fe5d43d9ae9320498289577ab7695e97..cd18762fab47274eab3b3e095452ac8d414fc827 100644
--- a/aidge_export_cpp/kernels/leakyrelu.hpp
+++ b/aidge_export_cpp/kernels/leakyrelu.hpp
@@ -2,26 +2,49 @@
 #define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__

 #include "network/typedefs.hpp"
+#include <sys/types.h>

-template<int NB_DATA,
+template<size_t NB_ELTS,
+         // Memory mapping: inputs
+         size_t INPUT_MEM_CONT_OFFSET,
+         size_t INPUT_MEM_CONT_SIZE,
+         size_t INPUT_MEM_WRAP_OFFSET,
+         size_t INPUT_MEM_WRAP_SIZE,
+         size_t INPUT_MEM_STRIDE,
+         // Memory mapping: outputs
+         size_t OUTPUT_MEM_CONT_OFFSET,
+         size_t OUTPUT_MEM_CONT_SIZE,
+         size_t OUTPUT_MEM_WRAP_OFFSET,
+         size_t OUTPUT_MEM_WRAP_SIZE,
+         size_t OUTPUT_MEM_STRIDE,
    typename Input_T, typename Output_T>
-__attribute__((always_inline)) inline
+__attribute__((always_inline)) inline
void leakyrelu_forward (
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const float negative_slope)
{
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int i = 0; i < NB_DATA; ++i) {
-        if (inputs[i] >= 0) {
-            outputs[i] = inputs[i];
-        } else {
-            outputs[i] = negative_slope * inputs[i];
+    int inOffset = 0;
+    int outOffset = 0;
+
+    for (size_t i = 0; i < NB_ELTS; ++i) {
+        if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
+            inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+        }
+
+        if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
+            outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
+                        - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
+        }
+
+        if (inputs[inOffset + i] >= 0) {
+            outputs[outOffset + i] = inputs[inOffset + i];
+        }
+        else {
+            outputs[outOffset + i] = negative_slope * inputs[inOffset + i];
        }
    }
}
-
-#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
\ No newline at end of file
+#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
diff --git a/aidge_export_cpp/kernels/matmul.hpp b/aidge_export_cpp/kernels/matmul.hpp
index b507c4f1e37065a620a0ac37ed370cfa6847487d..7494519e73cccc5fc0647e02065e3a1685979c7b 100644
--- a/aidge_export_cpp/kernels/matmul.hpp
+++ b/aidge_export_cpp/kernels/matmul.hpp
@@ -3,15 +3,14 @@

 #include "network/typedefs.hpp"
 #include "network/activation_utils.hpp"
+#include <sys/types.h>

 // Generic function for matmul and activation

-template<int M,
-         int K,
-         int N,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Rescaling_T>
+template<size_t NB_MAT, size_t N, size_t 
M, size_t K, + const size_t OFFSET_IN1[], const size_t OFFSET_IN2[], + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T, typename Rescaling_T> __attribute__((always_inline)) inline void matmul_forward ( const Input_T* __restrict inputs1, @@ -19,15 +18,28 @@ void matmul_forward ( Output_T* __restrict outputs, const Rescaling_T& __restrict rescaling) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - Output_T sum = Output_T(0); - for (int k = 0; k < K; ++k) { - sum += inputs1[K*m + k] * inputs2[N*k + n]; + for (size_t stack = 0; stack < NB_MAT; ++stack) { + const size_t offset1 = OFFSET_IN1[stack] * N * K; + const size_t offset2 = OFFSET_IN2[stack] * K * M; + Output_T* out_ptr = &outputs[stack * N * M]; + + for (size_t i = 0; i < N; ++i) { + const Output_T* in1_row = &inputs1[offset1 + i * K]; + + for (size_t j = 0; j < M; ++j) { + Output_T sum = 0; + + // Access column of inputs2 as row-major + for (size_t l = 0; l < K; ++l) { + sum += in1_row[l] * inputs2[offset2 + l * M + j]; + } + + out_ptr[i * M + j] = activation_forward_value<Output_T>( + sum, 0 /* not applicable */, ACTIVATION, rescaling + ); } - outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling); } } } -#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__ +#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__ \ No newline at end of file diff --git a/aidge_export_cpp/kernels/pad.hpp b/aidge_export_cpp/kernels/pad.hpp index 4e83257c1152b1963dd4b0eefc912216a729de7d..3f382c466de7b4549bf45b34398a7a893c8c5fc8 100644 --- a/aidge_export_cpp/kernels/pad.hpp +++ b/aidge_export_cpp/kernels/pad.hpp @@ -3,44 +3,83 @@ #include "network/typedefs.hpp" #include "network/utils.hpp" +#include <sys/types.h> -// Todo add border value and border type (Reflect, Constant, Wrap...) and add the two missing pad value (bottom and right) - -template<int NB_BATCHES, int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - int PADDING_TOP, - int PADDING_LEFT, - int PADDING_BOTTOM, - int PADDING_RIGHT, - typename Input_T, typename Output_T> -__attribute__((always_inline)) inline -void pad_forward( - double borderValue, - const Input_T* __restrict inputs, - Output_T* __restrict outputs - ) -{ - const unsigned int oySize = CHANNELS_HEIGHT + PADDING_TOP + PADDING_BOTTOM; - const unsigned int oxSize = CHANNELS_WIDTH + PADDING_LEFT + PADDING_RIGHT; - - for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) { - for (unsigned int ch = 0; ch < NB_CHANNELS; ++ch) { - const unsigned int preIndex = batch * NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH + ch * CHANNELS_HEIGHT * CHANNELS_WIDTH; - - for (unsigned int oy = 0; oy < oySize; ++oy) { - for (unsigned int ox = 0; ox < oxSize; ++ox) { - const unsigned int outIndex = batch * NB_CHANNELS * oySize * oxSize + ch * oySize * oxSize + oy * oxSize + ox; - - outputs[outIndex] = borderValue; - - const unsigned int inputX = ox - PADDING_LEFT; - const unsigned int inputY = oy - PADDING_TOP; - - if (inputY >= 0 and inputY < CHANNELS_HEIGHT and inputX >= 0 and inputX < CHANNELS_WIDTH) - { - outputs[outIndex] = inputs[preIndex + inputY * CHANNELS_WIDTH + inputX]; +// TODO : add border value and border type (Reflect, Constant, Wrap...) 
and add +// the two missing pad value (bottom and right) + +template <size_t NB_BATCHES, + size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, + size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, + size_t OUTPUTS_WIDTH, + int PADDING_TOP, + int PADDING_LEFT, + int PADDING_BOTTOM, + int PADDING_RIGHT, + typename Input_T, + typename Output_T> +__attribute__((always_inline)) inline void +pad_forward(double borderValue, + const Input_T *__restrict inputs, + Output_T *__restrict outputs) { + constexpr size_t oySize = + CHANNELS_HEIGHT + PADDING_TOP + PADDING_BOTTOM; + constexpr size_t oxSize = + CHANNELS_WIDTH + PADDING_LEFT + PADDING_RIGHT; + + constexpr size_t inputStrides[3] = { + NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH, + CHANNELS_WIDTH * CHANNELS_HEIGHT, + CHANNELS_WIDTH}; + constexpr size_t outputStrides[3] = { + NB_CHANNELS * oySize * oxSize, + oySize * oxSize, + oxSize, + }; + + for (size_t batch = 0, inBatchOffset = 0, outBatchOffset = 0; + batch < NB_BATCHES; + ++batch, + inBatchOffset += inputStrides[0], + outBatchOffset += outputStrides[0]) { + + for (size_t ch = 0, + inChannelOffset = inBatchOffset, + outChannelOffset = outBatchOffset; + ch < NB_CHANNELS; + ++ch, + inChannelOffset += inputStrides[1], + outChannelOffset += outputStrides[1]) { + + for (int oY = 0, + oDimYOffset = outChannelOffset, + iY = oY - PADDING_TOP, + // iDimOffset won't be used unless iY >= 0 hence no risk + // of negative idx + iDimYOffset = inChannelOffset + iY * inputStrides[2]; + static_cast<size_t>(oY) < oySize; + ++oY, + ++iY, + iDimYOffset += inputStrides[2], + oDimYOffset += outputStrides[2]) { + + if (iY < 0 or iY >= CHANNELS_HEIGHT) { + for (Output_T *o = outputs + oDimYOffset; + o != outputs + oDimYOffset + outputStrides[2]; + ++o) { + *o = borderValue; + } + continue; + } + for (size_t oX = 0; oX < oxSize; ++oX) { + const int iX = static_cast<int>(oX - PADDING_LEFT); + if (iX < 0 or iX >= CHANNELS_WIDTH) { + outputs[oDimYOffset + oX] = borderValue; + } else { + outputs[oDimYOffset + oX] = inputs[iDimYOffset + iX]; } } } @@ -48,4 +87,4 @@ void pad_forward( } } -#endif // __AIDGE_EXPORT_CPP_KERNELS_PAD2D__ +#endif // __AIDGE_EXPORT_CPP_KERNELS_PAD2D__ diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp index 12ac69ffcf30e72c6d854753d4d2a22b1ce4419c..7f2d3a392c33f132727ce4638a45c09c9d48761e 100644 --- a/aidge_export_cpp/kernels/pooling.hpp +++ b/aidge_export_cpp/kernels/pooling.hpp @@ -6,60 +6,77 @@ #include <limits> #include <cmath> #include <stdexcept> - - -template<int NB_CHANNELS, - int CHANNELS_HEIGHT, int CHANNELS_WIDTH, - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - int PADDING_Y, int PADDING_X, - int STRIDE_Y, int STRIDE_X, - int POOL_HEIGHT, int POOL_WIDTH, +#include <sys/types.h> + +template<size_t NB_CHANNELS, + size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH, + size_t NB_OUTPUTS, + size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH, + size_t PADDING_Y, size_t PADDING_X, + size_t STRIDE_Y, size_t STRIDE_X, + size_t POOL_HEIGHT, size_t POOL_WIDTH, Pooling_T POOLING_TYPE, ActivationFunction_T ACTIVATION, + // Memory mapping: inputs + size_t INPUT_MEM_CONT_OFFSET, + size_t INPUT_MEM_CONT_SIZE, + size_t INPUT_MEM_WRAP_OFFSET, + size_t INPUT_MEM_WRAP_SIZE, + size_t INPUT_MEM_STRIDE, + // Memory mapping: outputs + size_t OUTPUT_MEM_CONT_OFFSET, + size_t OUTPUT_MEM_CONT_SIZE, + size_t OUTPUT_MEM_WRAP_OFFSET, + size_t OUTPUT_MEM_WRAP_SIZE, + size_t OUTPUT_MEM_STRIDE, typename Input_T, typename Output_T> __attribute__((always_inline)) 
inline void pooling_forward( const Input_T* __restrict inputs, Output_T* __restrict outputs) { - constexpr int OUTPUTS_HEIGHT_NOPAD + constexpr size_t OUTPUTS_HEIGHT_NOPAD = (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y; - constexpr int OUTPUTS_WIDTH_NOPAD + constexpr size_t OUTPUTS_WIDTH_NOPAD = (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X; - for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { - const int syMin = (PADDING_Y == 0) ? 0 + for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const size_t syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - (oy * STRIDE_Y), 0); - const int syMax = (PADDING_Y == 0 + const size_t syMax = (PADDING_Y == 0 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? POOL_HEIGHT : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 0, POOL_HEIGHT); - const int iy = (oy * STRIDE_Y) - PADDING_Y; + const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y); #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { - for (int output = 0; output < NB_OUTPUTS; ++output) { + for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + for (size_t output = 0; output < NB_OUTPUTS; ++output) { // moved to inner loop for collapsing --> - const int sxMin = (PADDING_X == 0) ? 0 + const size_t sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - (ox * STRIDE_X), 0); - const int sxMax = (PADDING_X == 0 + const size_t sxMax = (PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) ? POOL_WIDTH : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 0, POOL_WIDTH); - const int ix = (ox * STRIDE_X) - PADDING_X; + const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X); + + const size_t oPos = (ox + OUTPUTS_WIDTH * oy); + int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos; - const int oPos = (ox + OUTPUTS_WIDTH * oy); - int oOffset = NB_OUTPUTS * oPos; + if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) { + oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET + - OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T); + } // <-- if (POOLING_TYPE == Max) { Input_T maxVal = std::numeric_limits<Input_T>::lowest(); - for (int sy = 0; sy < POOL_HEIGHT; ++sy) { + for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) { if ((PADDING_Y != 0 || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) && sy >= syMax - syMin) @@ -67,11 +84,29 @@ void pooling_forward( break; } - const int iPos = ((sxMin + ix) - + CHANNELS_WIDTH * (iy + syMin + sy)); - int iOffset = NB_CHANNELS * iPos; + const size_t iPos = static_cast<size_t>(sxMin + ix) + + CHANNELS_WIDTH * (static_cast<size_t>(iy + syMin + sy)); + int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos; + + // Wrapping cannot occur in the middle of a line, except if + // there is only one line (1D)! + bool wrapInRange = false; + + if (INPUT_MEM_WRAP_SIZE > 0 + && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + else if (INPUT_MEM_WRAP_SIZE > 0 && POOL_WIDTH > 1 + && CHANNELS_HEIGHT == 1 // single line (1D)! 
+                        && iOffset + POOL_WIDTH * (INPUT_MEM_STRIDE / sizeof(Input_T))
+                            > (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                    {
+                        wrapInRange = true;
+                    }

-                for (int sx = 0; sx < POOL_WIDTH; ++sx) {
+                    for (size_t sx = 0; sx < POOL_WIDTH; ++sx) {
                        if ((PADDING_X != 0
                                || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                            && sx >= sxMax - sxMin)
@@ -79,7 +114,16 @@ void pooling_forward(
                            break;
                        }

-                        int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
+                        int iOffsetInRange = iOffset + output
+                            + sx * (INPUT_MEM_STRIDE / sizeof(Input_T));
+
+                        if (wrapInRange &&
+                            iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
+                                            - INPUT_MEM_CONT_OFFSET
+                                            - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }

                        if (inputs[iOffsetInRange] > maxVal)
                            maxVal = inputs[iOffsetInRange];
@@ -91,7 +135,7 @@ void pooling_forward(
                else if (POOLING_TYPE == Average) {
                    float sum = 0;

-                    for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
+                    for (size_t sy = 0; sy < POOL_HEIGHT; ++sy) {
                        if ((PADDING_Y != 0
                                || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                            && sy >= syMax - syMin)
@@ -99,11 +143,29 @@ void pooling_forward(
                            break;
                        }

-                        const int iPos = ((sxMin + ix)
-                            + CHANNELS_WIDTH * (iy + syMin + sy));
-                        int iOffset = NB_CHANNELS * iPos;
+                        const size_t iPos = static_cast<size_t>(sxMin + ix)
+                            + CHANNELS_WIDTH * (static_cast<size_t>(iy + syMin + sy));
+                        int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
+
+                        // Wrapping cannot occur in the middle of a line, except if
+                        // there is only one line (1D)!
+                        bool wrapInRange = false;

-                    for (int sx = 0; sx < POOL_WIDTH; ++sx) {
+                        if (INPUT_MEM_WRAP_SIZE > 0
+                            && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
+                        {
+                            iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
+                                        - INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
+                        }
+                        else if (INPUT_MEM_WRAP_SIZE > 0 && POOL_WIDTH > 1
+                            && CHANNELS_HEIGHT == 1 // single line (1D)! 
+ && iOffset + POOL_WIDTH * (INPUT_MEM_STRIDE / sizeof(Input_T)) + > (INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + wrapInRange = true; + } + + for (size_t sx = 0; sx < POOL_WIDTH; ++sx) { if ((PADDING_X != 0 || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) && sx >= sxMax - sxMin) @@ -111,7 +173,17 @@ void pooling_forward( break; } - int iOffsetInRange = iOffset + output + sx * NB_CHANNELS; + int iOffsetInRange = iOffset + output + + sx * (INPUT_MEM_STRIDE / sizeof(Input_T)); + + if (wrapInRange && + iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) + { + iOffsetInRange += (INPUT_MEM_WRAP_OFFSET + - INPUT_MEM_CONT_OFFSET + - INPUT_MEM_CONT_SIZE) / sizeof(Input_T); + } + sum += inputs[iOffsetInRange]; } } diff --git a/aidge_export_cpp/kernels/reducemean.hpp b/aidge_export_cpp/kernels/reducemean.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b4a693d084af4941dd1d95c74eda84d720388922 --- /dev/null +++ b/aidge_export_cpp/kernels/reducemean.hpp @@ -0,0 +1,175 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__ +#define __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__ + +#include "network/typedefs.hpp" +#include "network/utils.hpp" +#include <cmath> +#include <type_traits> +#include <sys/types.h> + +template <typename T> +using Acc_T = + typename std::conditional_t<std::is_floating_point<T>::value, T, double>; + +// computes iterative mean +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value, T>::type +stableMean(const T *vec, std::size_t len, std::size_t stride) { + T mean = 0; + for (std::size_t i = 0; i < len; ++i) { + mean = std::fma(vec[i * stride] - mean, + static_cast<T>(1) / static_cast<T>(i + 1), + mean); + } + return mean; +} + +// Specialization for integers: perform the mean computation in float +template <typename T> +typename std::enable_if_t<!std::is_floating_point<T>::value, double> +stableMean(const T *vec, std::size_t len, std::size_t stride) { + double mean = 0; + for (size_t i = 0; i < len; ++i) { + mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean, + 1.0 / static_cast<double>(i + 1), + mean); + } + return mean; +} + +template <typename T> +typename std::enable_if_t<std::is_floating_point<T>::value, T> +castFromFloat(T value) { + return value; +} + +template <typename T> +typename std::enable_if_t<!std::is_floating_point<T>::value, T> +castFromFloat(double value) { + return static_cast<T>(std::nearbyint(value)); +} + +/** + * @brief computes the mean of the tensor values over specified axis + * This function can be called in a for loop to compute along different axes + iteratively + * @param[in] iDim : input dimensions of the tensor along axis to reduce + * @param[in] preAxisNbElts : nb of elements on each axis before the axis to + reduce. + * @param[in] postAxisNbElts : nb of elements on each axis after the axis to + reduce + * @param[in] axisNbElts : nb of elements on the axis to reduce + * @param[inout] prevAcc: Values returned by previous computation, if 1st + * iteration, its the input tensor. 
+ * @param[inout] currAcc: output of the computation: tensor with values
+ * averaged along the given axis
+ */
+template <typename Input_T, typename Output_T>
+Output_T *computeMeanOverAxis(const size_t preAxisNbElts,
+                              const size_t postAxisNbElts,
+                              const size_t axisNbElts,
+                              const size_t iDim,
+                              const Input_T *__restrict__ prevAcc,
+                              Output_T *currAcc) {
+    for (size_t preAxisIdx = 0, iPreAxisOffset = 0, oPreAxisOffset = 0;
+         preAxisIdx < preAxisNbElts;
+         ++preAxisIdx,
+                iPreAxisOffset += axisNbElts,
+                oPreAxisOffset += postAxisNbElts) {
+
+        for (size_t postAxisIdx = 0; postAxisIdx < postAxisNbElts;
+             ++postAxisIdx) {
+            currAcc[oPreAxisOffset + postAxisIdx] = castFromFloat<Output_T>(
+                stableMean(prevAcc + iPreAxisOffset + postAxisIdx,
+                           iDim,
+                           postAxisNbElts));
+        }
+    }
+    return currAcc;
+}
+
+template <size_t IN_NB_DIMS,
+          size_t IN_NB_ELTS,
+          size_t OUT_NB_ELTS,
+          size_t NB_AXES_TO_REDUCE,
+          typename Input_T,
+          typename Output_T>
+__attribute__((always_inline)) inline void
+reducemean_forward(const size_t axesToReduce[NB_AXES_TO_REDUCE],
+                   const size_t iDims[IN_NB_DIMS],
+                   const size_t preAxisStrides[IN_NB_DIMS],
+                   const size_t postAxisStrides[IN_NB_DIMS],
+                   const Input_T *__restrict input,
+                   Output_T *__restrict output) {
+
+    switch (NB_AXES_TO_REDUCE) {
+    case 0: {
+        copy_n(input, IN_NB_ELTS, output);
+        break;
+    }
+    case 1: {
+        output = computeMeanOverAxis<Input_T, Output_T>(
+            preAxisStrides[axesToReduce[0]],
+            postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]] * postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]],
+            input,
+            output);
+        break;
+    }
+    default: {
+
+        // the set up before the for loop is basically just unrolling the 1st
+        // iteration.
+        size_t outputElements = IN_NB_ELTS / iDims[axesToReduce[0]];
+        Acc_T<Output_T> *currAcc = new Acc_T<Input_T>[outputElements];
+        Acc_T<Input_T> *prevAcc = nullptr;
+        prevAcc = computeMeanOverAxis<Input_T, Output_T>(
+            preAxisStrides[axesToReduce[0]],
+            postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]] * postAxisStrides[axesToReduce[0]],
+            iDims[axesToReduce[0]],
+            input,
+            currAcc);
+
+        // mutable copy of preAxisStride to avoid modifying input values
+        size_t preAxisStrides_mut[IN_NB_DIMS];
+        for (size_t i = 0; i < IN_NB_DIMS; ++i) {
+            preAxisStrides_mut[i] =
+                i < axesToReduce[0] + 1
+                    ? 
preAxisStrides[i] + : preAxisStrides[i] / iDims[axesToReduce[0]]; + } + + for (size_t i = 1; i < NB_AXES_TO_REDUCE; ++i) { + const size_t axis = axesToReduce[i]; + outputElements /= iDims[i]; + currAcc = new Acc_T<Input_T>[outputElements]; + currAcc = computeMeanOverAxis<Acc_T<Input_T>, Acc_T<Output_T>>( + preAxisStrides_mut[axis], + postAxisStrides[axis], + iDims[axis] * postAxisStrides[axis], + iDims[axis], + prevAcc, + currAcc); + + for (size_t j = axis + 1; j < IN_NB_DIMS; ++j) { + preAxisStrides_mut[j] /= iDims[axis]; + } + delete[] prevAcc; + prevAcc = currAcc; + } + + for (size_t i = 0; i < OUT_NB_ELTS; ++i) { + output[i] = castFromFloat<Output_T>(currAcc[i]); + } + + if (currAcc) { + delete[] currAcc; + } + } + } +} + +#endif // __AIDGE_EXPORT_CPP_KERNELS_REDUCEMEAN__ diff --git a/aidge_export_cpp/kernels/rescaling.hpp b/aidge_export_cpp/kernels/rescaling.hpp deleted file mode 100644 index a831fa8730dfa45384c6f251d7fe079caa015ce6..0000000000000000000000000000000000000000 --- a/aidge_export_cpp/kernels/rescaling.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __AIDGE_EXPORT_CPP_NETWORK_RESCALING__ -#define __AIDGE_EXPORT_CPP_NETWORK_RESCALING__ - -#include "network/rescaling_utils.hpp" -#include "network/activation_utils.hpp" - -template<int NB_DATA, - ActivationFunction_T ACTIVATION, - typename Input_T, - typename Output_T, - typename Rescaling_T> -__attribute__((always_inline)) inline -void rescaling_forward ( - const Input_T* __restrict inputs, - Output_T* __restrict outputs, - const Rescaling_T& __restrict rescaling) -{ -#ifdef _OPENMP - #pragma omp parallel -#endif - for (int i = 0; i < NB_DATA; ++i) { - outputs[i] = activation_forward_value<Output_T>(inputs[i] , 0, ACTIVATION, rescaling); - } -} - -#endif // __AIDGE_EXPORT_CPP_NETWORK_RESCALING__ diff --git a/aidge_export_cpp/kernels/sigmoid.hpp b/aidge_export_cpp/kernels/sigmoid.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d3d548d7e8ebd4bdd215e232079aecad4a31c206 --- /dev/null +++ b/aidge_export_cpp/kernels/sigmoid.hpp @@ -0,0 +1,75 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__ +#define __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__ + +#include "network/typedefs.hpp" +#include <cmath> +#include <array> +#include <algorithm> +#include <limits> +#include <cstdint> +#include <sys/types.h> + +template <typename Input_T, class Output_T, size_t LUT_IDX, size_t LUT_SIZE> +constexpr Input_T sigmoid_index() +{ + constexpr auto unitVal = std::numeric_limits<typename std::make_unsigned<Output_T>::type>::max(); + constexpr auto y = 0.5f + 0.5f * static_cast<float>(LUT_IDX) / LUT_SIZE; + constexpr auto x = std::log(y) - std::log(1.0f - y); + return unitVal * x; +} + +template <typename Input_T, typename Output_T, std::size_t... I> +constexpr auto sigmoid_lookup_helper(std::index_sequence<I...>) +{ + return std::array<Input_T, sizeof...(I)>({sigmoid_index<Input_T, Output_T, I, sizeof...(I)>()...}); +} + +template <typename Input_T, typename Output_T, size_t LUT_Size> +constexpr auto sigmoid_lookup() +{ + return sigmoid_lookup_helper<Input_T, Output_T>(std::make_index_sequence<LUT_Size>()); +} + +template <typename Output_T, size_t LUT_SIZE> +constexpr Output_T sigmoid_scale_idx(size_t idx, bool pos) { + constexpr auto midVal = (std::numeric_limits<Output_T>::max() + 1) / 2; + return (pos) ? 
midVal + (midVal*idx) / LUT_SIZE - 1 : midVal - (midVal*idx) / LUT_SIZE; +} + +template<size_t NB_ELTS, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +typename std::enable_if<std::is_floating_point<Input_T>::value || std::is_floating_point<Output_T>::value, void>::type +sigmoid_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs) +{ + for (size_t i = 0; i < NB_ELTS; ++i) { + if (inputs[i] > Input_T(0)) { + outputs[i] = Output_T(1) / (Output_T(1) + std::exp(-inputs[i])); + } + else { + outputs[i] = std::exp(inputs[i]) / (Output_T(1) + std::exp(inputs[i])); + } + } +} + +template<size_t NB_ELTS, + typename Input_T, typename Output_T> +__attribute__((always_inline)) inline +typename std::enable_if<!std::is_floating_point<Input_T>::value && !std::is_floating_point<Output_T>::value, void>::type +sigmoid_forward ( + const Input_T* __restrict inputs, + Output_T* __restrict outputs) +{ + constexpr size_t LUT_Size = 1 << (8 * sizeof(Output_T) - 1); + static constexpr auto lut = sigmoid_lookup<Input_T, Output_T, LUT_Size>(); + + for (size_t i = 0; i < NB_ELTS; ++i) { + const auto it = std::lower_bound(std::begin(lut), std::end(lut), std::abs(inputs[i])); + const auto idx = std::distance(std::begin(lut), it); + outputs[i] = sigmoid_scale_idx<Output_T, LUT_Size>(idx, inputs[i] > 0); + } +} + +#endif // __AIDGE_EXPORT_CPP_KERNELS_SIGMOID__ diff --git a/aidge_export_cpp/kernels/slice.hpp b/aidge_export_cpp/kernels/slice.hpp new file mode 100644 index 0000000000000000000000000000000000000000..978ffca5bf549efcac0aa9a3c80570e6fcd830a6 --- /dev/null +++ b/aidge_export_cpp/kernels/slice.hpp @@ -0,0 +1,40 @@ +#ifndef __AIDGE_EXPORT_CPP_KERNELS_SLICE__ +#define __AIDGE_EXPORT_CPP_KERNELS_SLICE__ + +#include "network/typedefs.hpp" +#include <sys/types.h> + +// Generic function for slice +// Note : implementation differs from cpu_backend's but this one uses no additional buffer. 
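+// For each sliced axis i, the position of element e along that axis is
+// recovered as ax_idx = (e % AXES_MOD[i]) / AXES_DIV[i], which assumes
+// AXES_DIV[i] is the (row-major) stride of axis i in the input and
+// AXES_MOD[i] = AXES_DIV[i] * (size of axis i).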
+
+template <typename T,
+          size_t NB_DIMS, size_t NB_ELTS, size_t NB_AXES,
+          const size_t STARTS[], const size_t ENDS[], const size_t STEPS[],
+          const size_t AXES_MOD[], const size_t AXES_DIV[],
+          typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline
+void slice_forward (
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs)
+{
+    // iterate on each element and check if it belongs to the slice
+    size_t o = 0;
+    for (size_t e=0; e<NB_ELTS; e++){
+        bool is_sliced=true;
+        for (size_t i=0; i<NB_AXES; i++){ // check each sliced axis
+            const size_t ax_idx = (e % AXES_MOD[i]) / AXES_DIV[i];
+            // check steps and boundaries
+            if (((ax_idx - STARTS[i]) % STEPS[i] != 0) || (ax_idx < STARTS[i]) || (ax_idx >= ENDS[i])){
+                is_sliced = false;
+                break;
+            }
+        }
+        // If the element is in the slice, copy it to output
+        if (is_sliced){
+            outputs[o] = inputs[e];
+            o++;
+        }
+    }
+}
+
+#endif // __AIDGE_EXPORT_CPP_KERNELS_SLICE__
\ No newline at end of file
diff --git a/aidge_export_cpp/kernels/softmax.hpp b/aidge_export_cpp/kernels/softmax.hpp
index d29e9b42cba35287c71d32f211550a51b784aa12..da97f2dae18e1715f800aa737f0738057ee6f20a 100644
--- a/aidge_export_cpp/kernels/softmax.hpp
+++ b/aidge_export_cpp/kernels/softmax.hpp
@@ -7,10 +7,11 @@
 #include <type_traits>
 #include <cmath>
 #include <algorithm>
+#include <sys/types.h>

-template<int AXIS_SIZE,
-         int AXIS_SIZE_POST,
-         int AXIS_SIZE_PRE,
+template<size_t AXIS_SIZE,
+         size_t AXIS_SIZE_POST,
+         size_t AXIS_SIZE_PRE,
    typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void softmax_forward (
@@ -20,29 +21,29 @@ void softmax_forward (
    // Iterate over the "pre-axis" and "post-axis" slices.
    // For each slice along the axis, compute the maximum value,
    // the sum of exponentials, and then write the normalized softmax outputs.
-    for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
-        for (int j = 0; j < AXIS_SIZE_POST; ++j) {
+    for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
+        for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
            // Compute the base index for this slice.
-            const int baseIdx = i * AXIS_SIZE * AXIS_SIZE_POST + j;
+            const size_t baseIdx = i * AXIS_SIZE * AXIS_SIZE_POST + j;

            // Find the maximum value along the axis.
            Input_T maxVal = inputs[baseIdx];
-            for (int k = 1; k < AXIS_SIZE; ++k) {
-                const int idx = baseIdx + k * AXIS_SIZE_POST;
+            for (size_t k = 1; k < AXIS_SIZE; ++k) {
+                const size_t idx = baseIdx + k * AXIS_SIZE_POST;
                maxVal = std::max(maxVal, inputs[idx]);
            }

            // Compute the sum of the exponentials along the axis.
            Input_T sumExp = 0;
-            for (int k = 0; k < AXIS_SIZE; ++k) {
-                const int idx = baseIdx + k * AXIS_SIZE_POST;
+            for (size_t k = 0; k < AXIS_SIZE; ++k) {
+                const size_t idx = baseIdx + k * AXIS_SIZE_POST;
                outputs[idx] = std::exp(inputs[idx] - maxVal);
                sumExp += outputs[idx];
            }

            // Write the softmax values to the output. 
- for (int k = 0; k < AXIS_SIZE; ++k) { - const int idx = baseIdx + k * AXIS_SIZE_POST; + for (size_t k = 0; k < AXIS_SIZE; ++k) { + const size_t idx = baseIdx + k * AXIS_SIZE_POST; outputs[idx] /= sumExp; } } diff --git a/aidge_export_cpp/kernels/transpose.hpp b/aidge_export_cpp/kernels/transpose.hpp index 31c9e27869c5e2fde701f6700fd4964ea4cefd29..57d48c283644df26a52ff2c561206e09af0602a1 100644 --- a/aidge_export_cpp/kernels/transpose.hpp +++ b/aidge_export_cpp/kernels/transpose.hpp @@ -12,6 +12,8 @@ #ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__ #define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__ +#include <sys/types.h> + /** * @brief Transposes an N-dimensional tensor based on the specified permutation. * @@ -25,62 +27,36 @@ * @tparam T Data type of the tensor elements. * @tparam NB_DIMS Number of dimensions of the input tensor. * @param[in] inputs Pointer to the input tensor data stored in contiguous memory. - * @param[in] in_dims Array containing the size of each dimension of the input tensor. - * @param[in] permute Array of unsigned integers specifying the desired permutation - * of dimensions. Each value should be in the range [0, NB_DIMS-1], - * defining the new order of dimensions for the output tensor. - * @param[in] total_size Total number of elements in the input/output tensor. * @param[out] outputs Pointer to the pre-allocated memory for the transposed tensor. * Ensure this memory is appropriately sized to hold the transposed data. */ -template <typename T,unsigned int NB_DIMS> +template <typename T, size_t NB_DIMS, size_t NB_ELTS, + const size_t PERMUTE[], const size_t IN_DIMS[], const size_t OUT_STRIDE[]> +__attribute__((always_inline)) inline void transpose_ND_forward(const T *__restrict inputs, - const unsigned int *in_dims, - const unsigned int *permute, - const unsigned int total_size, T *__restrict outputs) { - // Compute strides for input tensor - unsigned int in_strides[NB_DIMS]; - in_strides[NB_DIMS - 1] = 1; - for (int i = NB_DIMS - 2; i >= 0; --i) - { - in_strides[i] = in_strides[i + 1] * in_dims[i + 1]; - } - - // Compute dimensions and strides for output tensor - unsigned int out_dims[NB_DIMS]; - unsigned int out_strides[NB_DIMS]; - out_strides[NB_DIMS - 1] = 1; - for (unsigned int i = 0; i < NB_DIMS; ++i) - { - out_dims[i] = in_dims[permute[i]]; - } - for (int i = NB_DIMS - 2; i >= 0; --i) - { - out_strides[i] = out_strides[i + 1] * out_dims[i + 1]; - } - - unsigned int current_idx[NB_DIMS]; + size_t current_idx[NB_DIMS] = {0}; // Iterate over all elements in the input tensor - for (unsigned int idx = 0; idx < total_size; ++idx) - { - - unsigned int remaining = idx; - for (unsigned int i = 0; i < NB_DIMS; ++i) - { - current_idx[i] = remaining / in_strides[i]; - remaining = remaining % in_strides[i]; - } - - unsigned int output_index = 0; - for (unsigned int i = 0; i < NB_DIMS; ++i) - { - output_index += current_idx[permute[i]] * out_strides[i]; + for (size_t idx = 0; idx < NB_ELTS; ++idx) { + // Compute output index using current_idx + size_t output_index = 0; + for (size_t i = 0; i < NB_DIMS; ++i) { + output_index += current_idx[PERMUTE[i]] * OUT_STRIDE[i]; } outputs[output_index] = inputs[idx]; + + // Increment current_idx as a multidimensional counter + for (int i = NB_DIMS - 1; i >= 0; --i) { + if (++current_idx[i] < IN_DIMS[i]) { + break; + } + else { + current_idx[i] = 0; + } + } } } diff --git a/aidge_export_cpp/operators/BatchNorm.py b/aidge_export_cpp/operators/BatchNorm.py index 
b0f5a16f195fb27846db9a8727b3804d84520d12..4552a009a2bc46726e5b8acc6473dfde467c2cf2 100644 --- a/aidge_export_cpp/operators/BatchNorm.py +++ b/aidge_export_cpp/operators/BatchNorm.py @@ -3,7 +3,9 @@ from aidge_core.export_utils import ExportNodeCpp from aidge_export_cpp import ROOT from aidge_export_cpp import ExportLibCpp -@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +@ExportLibCpp.register("BatchNorm2D", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32, aidge_core.dformat.nchw)), + aidge_core.ProdConso.in_place_model) class BatchNorm(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) diff --git a/aidge_export_cpp/operators/Conv.py b/aidge_export_cpp/operators/Conv.py index c8137c51377c103855b9c2d133707124ebef64c5..44b50b49a2ecf4c5957fc842aee01a19c3e0ecb1 100644 --- a/aidge_export_cpp/operators/Conv.py +++ b/aidge_export_cpp/operators/Conv.py @@ -2,8 +2,116 @@ import aidge_core from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes -@ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) -class Conv(ExportNodeCpp): +# Consumer-Producer model to allow memory wrapping for Conv/PaddedConv +# (and Pool/PaddedPool), keeping one input line margin in NHWC data format +# (one input line = W*C) +class PaddedInPlace_CP(aidge_core.ProdConso): + def __init__(self, op: aidge_core.Operator): + aidge_core.ProdConso.__init__(self, op, False) + + def default_model(op: aidge_core.Operator): + return PaddedInPlace_CP(op) + + def get_nb_required_protected(self, input_idx): + if input_idx != 0: + return super().get_nb_required_protected(input_idx) + + input = self.get_operator().get_input(0) + if not input: + return aidge_core.Elts_t.none_elts() + + # Non-Padded case: margin = one input line + margin = 1 + if not self.get_operator().is_atomic(): + # Padded case: margin = (padding_y / stride_y) input lines + sub_graph = self.get_operator().get_micro_graph().clone() + aidge_core.expand_metaops(sub_graph, True) + + padding_y = 0 + stride_y = 1 + for node in sub_graph.get_nodes(): + if hasattr(node.get_operator().attr, 'stride_dims'): + if len(node.get_operator().attr.stride_dims) > 1: + stride_y = node.get_operator().attr.stride_dims[0] + elif hasattr(node.get_operator().attr, 'begin_end_borders'): + if len(node.get_operator().attr.begin_end_borders) > 2: + padding_y = node.get_operator().attr.begin_end_borders[0] + + margin += padding_y // stride_y + + if len(input.dims()) == 4: + # 2D: one input line = W*C + margin *= input.dims()[2] * input.dims()[3] + else: + # 1D: one input line = C + margin *= input.dims()[2] + + return aidge_core.Elts_t.data_elts(margin) + + +@ExportLibCpp.register("Conv1D", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nwc) + ], + ), + PaddedInPlace_CP.default_model) +class Conv1D(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + + # Initialize kernel attributes + self.attributes["padding"] = [0, 0, 0, 0] + self.attributes["activation"] = "Linear" + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + + 
## Scaling + self.attributes["rescaling"] = "NoScaling" + self.attributes["shift_value"] = 0 + + # Browse the metaop to update kernel attributes + ConvNode = get_node_from_metaop(node, "Conv1D") + self.attributes["kernel_dims"] = ConvNode[0].get_operator().attr.kernel_dims + self.attributes["stride_dims"] = ConvNode[0].get_operator().attr.stride_dims + self.attributes["dilation_dims"] = ConvNode[0].get_operator().attr.dilation_dims + + # Template for layer configutation file generation + self.config_template = str(ROOT / "templates" / "configuration" / "convolution_config.jinja") + + # Template layer call function generation within the forward file + self.forward_template = str(ROOT / "templates" / "kernel_forward" / "convolution_forward.jinja") + + # Files to include within the generated forward.cpp file + self.include_list = [] + + # Path to the kernel(s) files to copy + self.add_kernel_to_copy(ROOT / "kernels" / "convolution.hpp") + self.add_kernel_to_copy(ROOT / "static" / "macs.hpp", "include/network", fwd_include=False) + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") + + +@ExportLibCpp.register("Conv2D", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) +class Conv2D(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -41,8 +149,19 @@ class Conv(ExportNodeCpp): self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") -@ExportLibCpp.register_metaop("QConv", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) -class QConv(Conv): +@ExportLibCpp.register_metaop("QConv", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) +class QConv(Conv2D): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -54,7 +173,18 @@ class QConv(Conv): self.attributes["rescaling"] = "SingleShiftScaling" -@ExportLibCpp.register_metaop("PadConv", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop(["PaddedConv2D", "PadConv"], + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class PadConv(QConv): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -64,7 +194,18 @@ class PadConv(QConv): self.attributes["padding"] = PadNode[0].get_operator().attr.begin_end_borders -@ExportLibCpp.register_metaop("ConvAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) 
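+# ConvAct: Conv2D fused with an activation. It reuses the NHWC QConv spec and
+# maps a fused ReLU onto the "Rectifier" kernel activation (other activations
+# are reported as unsupported).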
+@ExportLibCpp.register_metaop("ConvAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class ConvAct(QConv): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -75,7 +216,18 @@ class ConvAct(QConv): else: aidge_core.Log.error(f"{node.type()} activation is not yet supported.") -@ExportLibCpp.register_metaop("PadConvAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadConvAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class PadConvAct(PadConv, ConvAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) diff --git a/aidge_export_cpp/operators/ConvDw.py b/aidge_export_cpp/operators/ConvDw.py index 936c3b6a9a6b96012d3c01a1ab6961e1adcc0c1e..136666719552aa58ac8e7a08c363c4cfd128c5e2 100644 --- a/aidge_export_cpp/operators/ConvDw.py +++ b/aidge_export_cpp/operators/ConvDw.py @@ -1,8 +1,20 @@ import aidge_core from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes - -@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +from .Conv import PaddedInPlace_CP + +@ExportLibCpp.register("ConvDepthWise2D", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class ConvDw(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -42,7 +54,18 @@ class ConvDw(ExportNodeCpp): self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") -@ExportLibCpp.register_metaop("QConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("QConvDw", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class QConvDw(ConvDw): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -55,7 +78,18 @@ class QConvDw(ConvDw): self.attributes["rescaling"] = "SingleShiftScaling" -@ExportLibCpp.register_metaop("PadConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop(["PaddedConvDepthWise2D", "PadConvDw"], + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, 
aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class PadConvDw(QConvDw): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -65,7 +99,18 @@ class PadConvDw(QConvDw): self.attributes["padding"] = PadNode[0].get_operator().attr.begin_end_borders -@ExportLibCpp.register_metaop("ConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("ConvDwAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class ConvDwAct(QConvDw): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -76,7 +121,18 @@ class ConvDwAct(QConvDw): else: aidge_core.Log.error(f"{node.type()} activation is not yet supported.") -@ExportLibCpp.register_metaop("PadConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadConvDwAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc) + ], + ), + PaddedInPlace_CP.default_model) class PadConvDwAct(PadConvDw, ConvDwAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) diff --git a/aidge_export_cpp/operators/ElemWise.py b/aidge_export_cpp/operators/ElemWise.py index 7d073ca9549aa0fb67a2c63562536a7c6808e6cd..751fb3a7ec89d1f6e27daa3a1aae478709d715ac 100644 --- a/aidge_export_cpp/operators/ElemWise.py +++ b/aidge_export_cpp/operators/ElemWise.py @@ -15,6 +15,80 @@ class ElemWise(ExportNodeCpp): self.attributes["shift_value"] = 0 self.attributes["coef_value"] = 1 + nbdims_out = len(self.attributes["out_dims"][0]) + dims_a = self.attributes["in_dims"][0] + dims_b = self.attributes["in_dims"][1] + ndim_a = [0] * nbdims_out + ndim_b = [0] * nbdims_out + + idx_a = nbdims_out - len(dims_a) + for i in range(nbdims_out): + ndim_a[i] = 1 if i < idx_a else dims_a[i - idx_a] + + idx_b = nbdims_out - len(dims_b) + for i in range(nbdims_out): + ndim_b[i] = 1 if i < idx_b else dims_b[i - idx_b] + + # Find highest equal dimension + contiguousIdx = nbdims_out - 1 + for i in range(nbdims_out - 1, -1, -1): + if ndim_a[i] != ndim_b[i]: + break + contiguousIdx = i + + # Compute the highest number of contiguous data + input0_contiguous_size = 1 + input1_contiguous_size = 1 + output_contiguous_size = 1 + for i in range(contiguousIdx, nbdims_out): + input0_contiguous_size *= ndim_a[i] + input1_contiguous_size *= ndim_b[i] + output_contiguous_size *= self.attributes["out_dims"][0][i] + + self.attributes["input1_cont_size"] = input0_contiguous_size + self.attributes["input2_cont_size"] = input1_contiguous_size + self.attributes["output_cont_size"] = output_contiguous_size + + # Initialize strides for broadcasting + stride_post0 = [0] * contiguousIdx + stride_post1 = [0] 
* contiguousIdx + stride_step0 = [0] * contiguousIdx + stride_step1 = [0] * contiguousIdx + + if contiguousIdx > 0: + stride_post0[contiguousIdx - 1] = 1 + stride_post1[contiguousIdx - 1] = 1 + for i in range(contiguousIdx - 2, -1, -1): + stride_post0[i] = stride_post0[i + 1] * ndim_a[i + 1] + stride_post1[i] = stride_post1[i + 1] * ndim_b[i + 1] + + for i in range(contiguousIdx): + stride_step0[i] = 1 - stride_post0[i] if ndim_a[i] == 1 else 1 + stride_step1[i] = 1 - stride_post1[i] if ndim_b[i] == 1 else 1 + + # Offset and matrix count + offsetIn0 = 0 + offsetIn1 = 0 + nbMatrices = 1 + for i in range(contiguousIdx): + nbMatrices *= self.attributes["out_dims"][0][i] + + + self.attributes["offset_in1"] = [0] + self.attributes["offset_in2"] = [0] + + for stack in range(1, nbMatrices): + dim = contiguousIdx - 1 + tmp_stack = stack + while tmp_stack % self.attributes["out_dims"][0][dim] == 0: + tmp_stack //= self.attributes["out_dims"][0][dim] + dim -= 1 + offsetIn0 += stride_step0[dim] + offsetIn1 += stride_step1[dim] + + self.attributes["offset_in1"].append(offsetIn0) + self.attributes["offset_in2"].append(offsetIn1) + # Template for layer configutation file generation self.config_template = str(ROOT / "templates" / "configuration" / "elemwise_config.jinja") @@ -47,20 +121,26 @@ class QElemWise(ElemWise): self.attributes["rescaling"] = "SingleShiftScaling" -@ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("Add", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class Add(ElemWise): def __init__(self, node, mem_info): super().__init__(node, mem_info) self.attributes["elemwise_op"] = "Add" -@ExportLibCpp.register_metaop("QAdd", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("QAdd", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class QAdd(QElemWise, Add): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("AddAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("AddAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class AddAct(QAdd): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -72,20 +152,26 @@ class AddAct(QAdd): aidge_core.Log.error(f"{node.type()} activation is not yet supported.") -@ExportLibCpp.register("Sub", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("Sub", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class Sub(ElemWise): def __init__(self, node, mem_info): super().__init__(node, mem_info) self.attributes["elemwise_op"] = "Sub" -@ExportLibCpp.register_metaop("QSub", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("QSub", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class QSub(QElemWise, Sub): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("SubAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("SubAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class SubAct(QSub): def __init__(self, node, mem_info): super().__init__(node, mem_info) 
@@ -97,15 +183,41 @@ class SubAct(QSub): aidge_core.Log.error(f"{node.type()} activation is not yet supported.") -@ExportLibCpp.register("Mul", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("Mul", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class Mul(QElemWise): def __init__(self, node, mem_info): super().__init__(node, mem_info) self.attributes["elemwise_op"] = "Mul" -@ExportLibCpp.register_metaop("MulAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("MulAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class MulAct(Mul): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + + # Browse the metaop to update kernel attributes + if get_node_from_metaop(node, "ReLU"): + self.attributes["activation"] = "Rectifier" + else: + aidge_core.Log.error(f"{node.type()} activation is not yet supported.") + +@ExportLibCpp.register("Div", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) +class Div(QElemWise): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["elemwise_op"] = "Div" + + +@ExportLibCpp.register_metaop("DivAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) +class DivAct(Div): def __init__(self, node, mem_info): super().__init__(node, mem_info) diff --git a/aidge_export_cpp/operators/Erf.py b/aidge_export_cpp/operators/Erf.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0f36a2e09ea359681d88b77073905ce6bb2fff --- /dev/null +++ b/aidge_export_cpp/operators/Erf.py @@ -0,0 +1,21 @@ +import aidge_core +from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop +from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes + +@ExportLibCpp.register("Erf", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)), + aidge_core.ProdConso.in_place_model) +class ErfCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["activation"] = "Linear" + self.attributes["rescaling"] = "NoScaling" + self.config_template = str( + ROOT / "templates" / "configuration" / "erf_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "erf_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "erf.hpp"), + str(ROOT / "kernels" / "activation.hpp") + ] \ No newline at end of file diff --git a/aidge_export_cpp/operators/Fc.py b/aidge_export_cpp/operators/Fc.py index d32d20e2fad90f8418ee58067f1cd6e6c7e72065..184baabd7c02cae7ca6b1942a6cb37136ceb1a7b 100644 --- a/aidge_export_cpp/operators/Fc.py +++ b/aidge_export_cpp/operators/Fc.py @@ -2,12 +2,22 @@ import aidge_core from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes -@ExportLibCpp.register("FC", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("FC", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) class FC(ExportNodeCpp): def __init__(self, node, 
mem_info): super().__init__(node, mem_info) # Initialize kernel attributes + self.attributes["kernel"] = "default" self.attributes["activation"] = "Linear" self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") @@ -33,8 +43,34 @@ class FC(ExportNodeCpp): self.include_list.append("network/utils.hpp") # aidge_cmp function self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") +@ExportLibCpp.register("FC", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) +class FC_NHWC(FC): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + if node.attributes().has_attr("ignore_input_format"): + self.attributes["kernel"] = "" + else: + self.attributes["kernel"] = "transpose" -@ExportLibCpp.register_metaop("QFC", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("QFC", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) class QFC(FC): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -46,8 +82,34 @@ class QFC(FC): if self.attributes["shift_value"] != 0: self.attributes["rescaling"] = "SingleShiftScaling" +@ExportLibCpp.register_metaop("QFC", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) +class QFC_NHWC(QFC): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + if node.attributes().has_attr("ignore_input_format"): + self.attributes["kernel"] = "" + else: + self.attributes["kernel"] = "transpose" -@ExportLibCpp.register_metaop("FCAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("FCAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) class FCAct(QFC): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -57,3 +119,21 @@ class FCAct(QFC): self.attributes["activation"] = "Rectifier" else: aidge_core.Log.error(f"{node.type()} activation is not yet supported.") + +@ExportLibCpp.register_metaop("FCAct", + aidge_core.ImplSpec( + [ # Input specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc), + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + [ # Output specifications + aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.any) + ], + )) +class FCAct_NHWC(FCAct): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + if node.attributes().has_attr("ignore_input_format"): + self.attributes["kernel"] = "" + else: + self.attributes["kernel"] = "transpose" diff --git a/aidge_export_cpp/operators/Identity.py b/aidge_export_cpp/operators/Identity.py new file mode 100644 index 
0000000000000000000000000000000000000000..69d36c59a9d41259ff5f40320aaf35070606169b --- /dev/null +++ b/aidge_export_cpp/operators/Identity.py @@ -0,0 +1,19 @@ +import aidge_core +from aidge_core.export_utils import ExportNodeCpp +from aidge_export_cpp import ROOT +from aidge_export_cpp import ExportLibCpp + +@ExportLibCpp.register("Identity", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) +class IdentityCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.config_template = str( + ROOT / "templates" / "configuration" / "identity_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "identity_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "identity.hpp"), + ] diff --git a/aidge_export_cpp/operators/MatMul.py b/aidge_export_cpp/operators/MatMul.py index ac8802cd876c257f70f82ac5464df775299f52ce..7baf4a7886e0c286d8263d924fa75b07865c3790 100644 --- a/aidge_export_cpp/operators/MatMul.py +++ b/aidge_export_cpp/operators/MatMul.py @@ -9,6 +9,75 @@ class MatMulCPP(ExportNodeCpp): super().__init__(node, mem_info) self.attributes["activation"] = "Linear" self.attributes["rescaling"] = "NoScaling" + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + + # Initialize arrays storing broadcasted(or not) dims + nbdims_out = len(self.attributes["out_dims"][0]) + dims_a = self.attributes["in_dims"][0] + dims_b = self.attributes["in_dims"][1] + ndim_a = [0] * nbdims_out + ndim_b = [0] * nbdims_out + + if len(dims_a) == 1: + ndim_a[0] = 1 + ndim_a[1] = dims_a[0] + + if len(dims_b) == 1: + ndim_b[0] = 1 + ndim_b[1] = dims_b[0] + + idx_a = nbdims_out - len(dims_a) + for i in range(nbdims_out): + ndim_a[i] = 1 if i < idx_a else dims_a[i - idx_a] + + idx_b = nbdims_out - len(dims_b) + for i in range(nbdims_out): + ndim_b[i] = 1 if i < idx_b else dims_b[i - idx_b] + + # Initialize strides for broadcasting + stride_post0 = [0] * (nbdims_out - 2) + stride_post1 = [0] * (nbdims_out - 2) + stride_step0 = [0] * (nbdims_out - 2) + stride_step1 = [0] * (nbdims_out - 2) + + if nbdims_out > 2: + stride_post0[nbdims_out - 3] = 1 + stride_post1[nbdims_out - 3] = 1 + for i in range(nbdims_out - 4, -1, -1): + stride_post0[i] = stride_post0[i + 1] * ndim_a[i + 1] + stride_post1[i] = stride_post1[i + 1] * ndim_b[i + 1] + + for i in range(nbdims_out - 2): + stride_step0[i] = 1 - stride_post0[i] if ndim_a[i] == 1 else 1 + stride_step1[i] = 1 - stride_post1[i] if ndim_b[i] == 1 else 1 + + # if len(dims_b) == len(dims_a), then len(dims_a) == nbdims_out == len(dims_b); + # else it will be broadcasted to the correct dims + nbMatrices = 1 + for i in range(nbdims_out - 3, -1, -1): + nbMatrices *= self.attributes["out_dims"][0][i] + + offsetIn0 = 0 + offsetIn1 = 0 + self.attributes["offset_in1"] = [0] + self.attributes["offset_in2"] = [0] + + for stack in range(1, nbMatrices): + dim = nbdims_out - 3 + tmp_stack = stack + while tmp_stack % self.attributes["out_dims"][0][dim] == 0: + tmp_stack //= self.attributes["out_dims"][0][dim] + dim -= 1 + offsetIn0 += stride_step0[dim] + offsetIn1 += stride_step1[dim] + + self.attributes["offset_in1"].append(offsetIn0) + self.attributes["offset_in2"].append(offsetIn1) + + self.attributes["n"] = ndim_a[nbdims_out - 2] + self.attributes["m"] = ndim_b[nbdims_out - 1] + self.attributes["k"] = ndim_a[nbdims_out - 1] + self.config_template = str( ROOT / "templates" / "configuration" / 
"matmul_config.jinja") self.forward_template = str( @@ -17,3 +86,8 @@ class MatMulCPP(ExportNodeCpp): self.kernels_to_copy = [ str(ROOT / "kernels" / "matmul.hpp"), ] + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") diff --git a/aidge_export_cpp/operators/Pad.py b/aidge_export_cpp/operators/Pad.py index f84f2cff0f1df283327c1122ddb7b77049bddd3f..bc73ef43ab862bf11a2a41477006827ffcb19afa 100644 --- a/aidge_export_cpp/operators/Pad.py +++ b/aidge_export_cpp/operators/Pad.py @@ -3,7 +3,30 @@ from aidge_core.export_utils import ExportNodeCpp from aidge_export_cpp import ROOT from aidge_export_cpp import ExportLibCpp -@ExportLibCpp.register("Pad2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +# Consumer-Producer model to allow memory wrapping for Pad in-place operator +class PadInPlace_CP(aidge_core.ProdConso): + def __init__(self, op: aidge_core.Operator): + aidge_core.ProdConso.__init__(self, op, False) + + def default_model(op: aidge_core.Operator): + return PadInPlace_CP(op) + + def get_nb_required_protected(self, input_idx): + if input_idx != 0: + return super().get_nb_required_protected(input_idx) + + pad_node = self.get_operator() + input = pad_node.get_operator().get_input(0) + if input: + output = pad_node.get_operator().get_output(0) + return aidge_core.Elts_t.data_elts(max(0, output.size() - input.size())) + else: + return aidge_core.Elts_t.none_elts() + + +@ExportLibCpp.register("Pad2D", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + PadInPlace_CP.default_model) class CppPad(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) diff --git a/aidge_export_cpp/operators/Pool.py b/aidge_export_cpp/operators/Pool.py index 10d595e5ed4a76c22bcc15f90d8c693b8dbf2144..1d6cd53d6d0f2d7c000b8f3acc81ee5771295080 100644 --- a/aidge_export_cpp/operators/Pool.py +++ b/aidge_export_cpp/operators/Pool.py @@ -2,6 +2,7 @@ import aidge_core from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop from aidge_export_cpp import ROOT from aidge_export_cpp import ExportLibCpp +from .Conv import PaddedInPlace_CP class Pool(ExportNodeCpp): def __init__(self, node, mem_info): @@ -52,7 +53,9 @@ class PoolAct(Pool): aidge_core.Log.error(f"{node.type()} activation is not yet supported.") -@ExportLibCpp.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("MaxPooling2D", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class MaxPool(Pool): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -64,25 +67,33 @@ class MaxPool(Pool): self.attributes["stride_dims"] = PoolNode[0].get_operator().attr.stride_dims -@ExportLibCpp.register_metaop("PadMaxPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop(["PaddedMaxPooling2D", "PadMaxPool"], + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadMaxPool(MaxPool, PadPool): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("MaxPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("MaxPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, 
aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class MaxPoolAct(MaxPool, PoolAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("PadMaxPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadMaxPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadMaxPoolAct(PadMaxPool, MaxPoolAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register("AvgPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("AvgPooling2D", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class AvgPool(Pool): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -94,25 +105,33 @@ class AvgPool(Pool): self.attributes["stride_dims"] = PoolNode[0].get_operator().attr.stride_dims -@ExportLibCpp.register_metaop("PadAvgPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop(["PaddedAvgPooling2D", "PadAvgPool"], + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadAvgPool(AvgPool, PadPool): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("AvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("AvgPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class AvgPoolAct(AvgPool, PoolAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("PadAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadAvgPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadAvgPoolAct(PadAvgPool, AvgPoolAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register("GlobalAveragePooling", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("GlobalAveragePooling", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + aidge_core.ProdConso.in_place_model) class GlobalAvgPool(Pool): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -121,19 +140,25 @@ class GlobalAvgPool(Pool): self.attributes["kernel_dims"] = [self.attributes["in_width"][0], self.attributes["in_height"][0]] -@ExportLibCpp.register_metaop("PadGlobalAvgPool", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadGlobalAvgPool", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadGlobalAvgPool(GlobalAvgPool, PadPool): def __init__(self, node, mem_info): super().__init__(node, mem_info) -@ExportLibCpp.register_metaop("GlobalAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("GlobalAvgPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + aidge_core.ProdConso.in_place_model) class GlobalAvgPoolAct(GlobalAvgPool, PoolAct): def __init__(self, node, mem_info): super().__init__(node, 
mem_info) -@ExportLibCpp.register_metaop("PadGlobalAvgPoolAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register_metaop("PadGlobalAvgPoolAct", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.nhwc)), + PaddedInPlace_CP.default_model) class PadGlobalAvgPoolAct(PadGlobalAvgPool, GlobalAvgPoolAct): def __init__(self, node, mem_info): super().__init__(node, mem_info) \ No newline at end of file diff --git a/aidge_export_cpp/operators/Producer.py b/aidge_export_cpp/operators/Producer.py index 627dcb29ed21e2779a30978b5b22768c0c2cc9c4..5bd63323155b3342dcefc6b92e3c9861c8333127 100644 --- a/aidge_export_cpp/operators/Producer.py +++ b/aidge_export_cpp/operators/Producer.py @@ -2,29 +2,12 @@ import os from pathlib import Path import numpy as np import aidge_core -from aidge_core.export_utils import ExportNode, generate_file +from aidge_core.export_utils import ExportNodeCpp, generate_file, aidge2c from aidge_export_cpp import ROOT from aidge_export_cpp import ExportLibCpp -def numpy_dtype2ctype(dtype): - if dtype == np.int8: - return "int8_t" - elif dtype == np.int16: - return "int16_t" - elif dtype == np.int32: - return "int32_t" - elif dtype == np.int64: - return "int64_t" - elif dtype == np.float32: - return "float" - elif dtype == np.float64: - return "double" - # Add more dtype mappings as needed - else: - raise ValueError(f"Unsupported {dtype} dtype") - def export_params(name: str, - array: np.ndarray, + output: aidge_core.Tensor, filepath: str): # Get directory name of the file @@ -38,30 +21,40 @@ def export_params(name: str, filepath, str(ROOT / "templates" / "data" / "parameters.jinja"), name=name, - data_t=numpy_dtype2ctype(array.dtype), - values=array.tolist() + dims=output.dims(), + dtype=aidge2c(output.dtype()), + values=np.array(output).tolist() ) @ExportLibCpp.register("Producer", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) -class ProducerCPP(ExportNode): +class ProducerCPP(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) - self.values = np.array(self.operator.get_output(0)) + self.output = self.operator.get_output(0) self.ignore = node.attributes().has_attr("ignore") - if len(self.values.shape) == 4: # Note: export in HWC - self.values = np.transpose(self.values, (0, 2, 3, 1)) - def export(self, export_folder: Path): - if not self.ignore : - header_path = f"include/parameters/{self.attributes['name']}.h" - export_params( - self.attributes['out_name'][0], - self.values.reshape(-1), - str(export_folder / header_path)) - return [header_path] - return [] + if self.ignore: + return [] + + path_to_definition = f"{self.config_path}/{self.attributes['name']}.{self.config_extension}" + + try: + aidge_core.export_utils.code_generation.generate_file( + str(export_folder / path_to_definition), + str(ROOT / "templates" / "configuration" / "producer_config.jinja"), + **self.attributes + ) + except Exception as e: + raise RuntimeError(f"Error when creating config file for {self.node.name()}[{self.node.type()}].") from e + + header_path = f"include/parameters/{self.attributes['name']}.h" + export_params( + self.attributes['out_name'][0], + self.output, + str(export_folder / header_path)) + return [path_to_definition, header_path] def forward(self): # A Producer does nothing during forward - return [] \ No newline at end of file + return [] diff --git a/aidge_export_cpp/operators/Quantizer.py b/aidge_export_cpp/operators/Quantizer.py index 
51f5c23da24e7c6a47c162314f54a15c8845fc00..ae2860018406ee93871755243c38414e0e719568 100644 --- a/aidge_export_cpp/operators/Quantizer.py +++ b/aidge_export_cpp/operators/Quantizer.py @@ -28,16 +28,16 @@ class Quantizer(ExportNodeCpp): self.attributes["rescaling"] = "SingleShiftScaling" # Template for layer configutation file generation - self.config_template = str(ROOT / "templates" / "configuration" / "rescaling_config.jinja") + self.config_template = str(ROOT / "templates" / "configuration" / "activation_config.jinja") # Template layer call function generation within the forward file - self.forward_template = str(ROOT / "templates" / "kernel_forward" / "rescaling_forward.jinja") + self.forward_template = str(ROOT / "templates" / "kernel_forward" / "activation_forward.jinja") # Files to include within the generated forward.cpp file self.include_list = [] # Path to the kernel(s) files to copy - self.add_kernel_to_copy(ROOT / "kernels" / "rescaling.hpp") + self.add_kernel_to_copy(ROOT / "kernels" / "activation.hpp") # Include aidge outputs within the fwd file if self.attributes["aidge_cmp"]: diff --git a/aidge_export_cpp/operators/ReLU.py b/aidge_export_cpp/operators/ReLU.py index 55e7e19425e0a5b61790b58a2d36a8f233f75228..10390ea730fb2750fbc5a3b43d9c95372d6b211a 100644 --- a/aidge_export_cpp/operators/ReLU.py +++ b/aidge_export_cpp/operators/ReLU.py @@ -2,7 +2,9 @@ import aidge_core from aidge_core.export_utils import ExportNodeCpp from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes -@ExportLibCpp.register("ReLU", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +@ExportLibCpp.register("ReLU", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) class ReLU(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) @@ -32,7 +34,34 @@ class ReLU(ExportNodeCpp): if self.attributes["aidge_cmp"]: self.include_list.append("network/utils.hpp") # aidge_cmp function self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") - + +@ExportLibCpp.register("LeakyReLU", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) +class LeakyReLU(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + + # Initialize kernel attributes + self.attributes["alpha"] = node.get_operator().attr.negative_slope + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + + # Template for layer configutation file generation + self.config_template = str(ROOT / "templates" / "configuration" / "leakyrelu_config.jinja") + + # Template layer call function generation within the forward file + self.forward_template = str(ROOT / "templates" / "kernel_forward" / "leakyrelu_forward.jinja") + + # Files to include within the generated forward.cpp file + self.include_list = [] + + # Path to the kernel(s) files to copy + self.add_kernel_to_copy(ROOT / "kernels" / "leakyrelu.hpp") + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") @ExportLibCpp.register_metaop("QReLU", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) class QReLU(ReLU): diff --git a/aidge_export_cpp/operators/ReduceMean.py b/aidge_export_cpp/operators/ReduceMean.py new file mode 100644 index 0000000000000000000000000000000000000000..fd0d6316ae6143f9aad062f3feec14b4a6d803bd 
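For reference, the new LeakyReLU export node above only forwards alpha (the operator's negative_slope) to the templates; the element-wise mapping its kernel (kernels/leakyrelu.hpp, not shown in this patch) is expected to compute is simply the following, sketched here in NumPy for illustration only:

import numpy as np

def leaky_relu(x: np.ndarray, alpha: float) -> np.ndarray:
    # y = x for x >= 0, alpha * x otherwise
    return np.where(x >= 0, x, alpha * x)

x = np.array([-2.0, -0.5, 0.0, 1.5], dtype=np.float32)
print(leaky_relu(x, alpha=0.01))  # [-0.02  -0.005  0.  1.5]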
--- /dev/null +++ b/aidge_export_cpp/operators/ReduceMean.py @@ -0,0 +1,56 @@ +import aidge_core +from aidge_core.export_utils import ExportNodeCpp +from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes + +@ExportLibCpp.register("ReduceMean", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +class ReducemeanCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + + if self.operator.get_input(0) is None: + raise AttributeError("Input 0 not found for operator ReduceMean") + + input_T = self.operator.get_input(0) + + # COMPUTING PRE/POST AXES STRIDES: + # Example : + # input dims {3, 3, 2} + # stride_pre = {1,3,9} + # stride_post = {6, 2, 1} + post_axis_strides = input_T.strides() + + pre_axis_strides = [1] + for i in range(1, len(post_axis_strides)): + pre_axis_strides.append(pre_axis_strides[i - 1] * input_T.dims()[i - 1]) + + in_nb_elts = input_T.dims()[0] * input_T.strides()[0] + out_nb_elts = in_nb_elts + axes_to_reduce = self.node.get_operator().attr.axes + for i in axes_to_reduce: + out_nb_elts = out_nb_elts // input_T.dims()[i] + + self.attributes["in_dims"] = input_T.dims() + self.attributes["in_nb_dims"] = len(input_T.dims()) + self.attributes["in_nb_elts"] = in_nb_elts + self.attributes["out_nb_elts"] = out_nb_elts + self.attributes["nb_axes_to_reduce"] = len(self.operator.attr.axes) + self.attributes["axes_to_reduce"] = self.node.get_operator().attr.axes + self.attributes["pre_axis_strides"] = pre_axis_strides + self.attributes["post_axis_strides"] = post_axis_strides + + # axis = node.get_operator().attr.axis if node.get_operator().attr.axis >= 0 else node.get_operator().attr.axis + nbDims + + self.config_template = str( + ROOT / "templates" / "configuration" / "reducemean_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "reducemean_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "reducemean.hpp"), + ] + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") diff --git a/aidge_export_cpp/operators/Reshape.py b/aidge_export_cpp/operators/Reshape.py index ed094c94bbffcf84a2370bb24cb9110f9e2e6c68..b7166853a5790d89b40ffc1bd15400614361cfb3 100644 --- a/aidge_export_cpp/operators/Reshape.py +++ b/aidge_export_cpp/operators/Reshape.py @@ -3,15 +3,28 @@ from aidge_core.export_utils import ExportNodeCpp from aidge_export_cpp import ROOT from aidge_export_cpp import ExportLibCpp -@ExportLibCpp.register("Reshape", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32))) +@ExportLibCpp.register("Reshape", + # Reshape cannot accept any format, because its output format does not necessarily + # match its input format. So, if the previous layer is changed from NCHW to NHWC + # by adapt_to_backend(), it won't propagate the new format, ultimately leading + # to a missing transpose for the next layer! 
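The comment above can be checked with a short NumPy sketch (illustration only, not export code): reshaping the NHWC-laid-out buffer does not produce the same element order as reshaping the logical NCHW tensor, so letting adapt_to_backend() switch the producer to NHWC without an explicit transpose before a layout-unaware Reshape would silently reorder the data.

import numpy as np

x_nchw = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)   # logical NCHW tensor
x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))             # same values, NHWC memory layout

flat_from_nchw = x_nchw.reshape(2, -1)                  # what the graph semantically expects
flat_from_nhwc = x_nhwc.reshape(2, -1)                  # what a layout-unaware Reshape would emit

print(np.array_equal(flat_from_nchw, flat_from_nhwc))   # False: element order differs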
+ aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any, aidge_core.dformat.default)), + aidge_core.ProdConso.in_place_model) class ReshapeCPP(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + self.config_template = str( - ROOT / "templates" / "configuration" / "reshape_config.jinja") + ROOT / "templates" / "configuration" / "identity_config.jinja") self.forward_template = str( - ROOT / "templates" / "kernel_forward" / "reshape_forward.jinja") + ROOT / "templates" / "kernel_forward" / "identity_forward.jinja") self.include_list = [] self.kernels_to_copy = [ - str(ROOT / "kernels" / "reshape.hpp"), + str(ROOT / "kernels" / "identity.hpp"), ] + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") diff --git a/aidge_export_cpp/operators/Sigmoid.py b/aidge_export_cpp/operators/Sigmoid.py new file mode 100644 index 0000000000000000000000000000000000000000..de89d08631ab44ef32df99d02b6f2de3a944c822 --- /dev/null +++ b/aidge_export_cpp/operators/Sigmoid.py @@ -0,0 +1,21 @@ +import aidge_core +from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop +from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes + +@ExportLibCpp.register("Sigmoid", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)), + aidge_core.ProdConso.in_place_model) +class SigmoidCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + self.attributes["activation"] = "Linear" + self.attributes["rescaling"] = "NoScaling" + self.config_template = str( + ROOT / "templates" / "configuration" / "sigmoid_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "sigmoid_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "sigmoid.hpp"), + str(ROOT / "kernels" / "activation.hpp") + ] \ No newline at end of file diff --git a/aidge_export_cpp/operators/Slice.py b/aidge_export_cpp/operators/Slice.py new file mode 100644 index 0000000000000000000000000000000000000000..26041b0178dd43d867ca00382fe0a675a3c865d9 --- /dev/null +++ b/aidge_export_cpp/operators/Slice.py @@ -0,0 +1,55 @@ +import aidge_core +from aidge_core.export_utils import ExportNodeCpp +from aidge_export_cpp import ROOT +from aidge_export_cpp import ExportLibCpp + +@ExportLibCpp.register("Slice", + aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any))) +class SliceCPP(ExportNodeCpp): + def __init__(self, node, mem_info): + super().__init__(node, mem_info) + # Safely retrieve and validate parameter attributes + input_dims = self.attributes["in_dims"][0] + axes = [a if a>=0 else a+len(input_dims) for a in node.get_operator().attr.axes] # positive axes + starts, ends, steps = node.get_operator().attr.starts, node.get_operator().attr.ends, node.get_operator().attr.steps + assert len(starts) == len(axes) + assert len(ends) == len(axes) + assert len(steps) == len(axes) + # positive start and end indices + starts = [s if s>=0 else s+input_dims[axes[i]] for i,s in enumerate(starts)] + ends = [e if e>=0 else e+input_dims[axes[i]] for i,e in enumerate(ends)] + # assert boundaries + for a in axes: assert a>=0 and a < len(input_dims) + for i,e in enumerate(ends): assert e>=0 and e <= input_dims[axes[i]] + for i,s in enumerate(starts): assert s>=0 and s < ends[i] + for 
st in steps: assert st >= 1 + self.attributes["starts"] = starts + self.attributes["ends"] = ends + self.attributes["steps"] = steps + + #Compute mod and div values that will be used to convert input flat-index to axes-index + axes_mod = [input_dims[a] for a in axes] + axes_div = len(axes) * [1] + for i,ax in enumerate(axes): + for j in range(ax+1, len(input_dims)): + axes_mod[i] *= input_dims[j] + axes_div[i] *= input_dims[j] + self.attributes["axes_mod"] = axes_mod + self.attributes["axes_div"] = axes_div + + + self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp") + + self.config_template = str( + ROOT / "templates" / "configuration" / "slice_config.jinja") + self.forward_template = str( + ROOT / "templates" / "kernel_forward" / "slice_forward.jinja") + self.include_list = [] + self.kernels_to_copy = [ + str(ROOT / "kernels" / "slice.hpp"), + ] + + # Include aidge outputs within the fwd file + if self.attributes["aidge_cmp"]: + self.include_list.append("network/utils.hpp") # aidge_cmp function + self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp") diff --git a/aidge_export_cpp/operators/Transpose.py b/aidge_export_cpp/operators/Transpose.py index 63082ea67fcd35548681108b9ae6306c600fa252..e0784374d1b6282605b752805bd3d25f0abe0784 100644 --- a/aidge_export_cpp/operators/Transpose.py +++ b/aidge_export_cpp/operators/Transpose.py @@ -7,6 +7,27 @@ from aidge_export_cpp import ExportLibCpp class TransposeCPP(ExportNodeCpp): def __init__(self, node, mem_info): super().__init__(node, mem_info) + + nbdims = len(self.attributes["in_dims"][0]) + + # Compute input strides + in_strides = [0] * nbdims + in_strides[nbdims - 1] = 1 + for i in range(nbdims - 2, -1, -1): + in_strides[i] = in_strides[i + 1] * self.attributes["in_dims"][0][i + 1] + + # Compute output dimensions based on permutation + out_dims = [self.attributes["in_dims"][0][self.attributes["output_dims_order"][i]] for i in range(nbdims)] + + # Compute output strides + out_strides = [0] * nbdims + out_strides[nbdims - 1] = 1 + for i in range(nbdims - 2, -1, -1): + out_strides[i] = out_strides[i + 1] * out_dims[i + 1] + + self.attributes["in_strides"] = in_strides + self.attributes["out_strides"] = out_strides + self.config_template = str( ROOT / "templates" / "configuration" / "transpose_ND_config.jinja") self.forward_template = str( diff --git a/aidge_export_cpp/static/Makefile b/aidge_export_cpp/static/Makefile index 176b8fc1dc3082c5de4cefbba961073b446f4614..5c95c2258040f701c23ee6f56b88b3504bc9918c 100644 --- a/aidge_export_cpp/static/Makefile +++ b/aidge_export_cpp/static/Makefile @@ -1,16 +1,43 @@ CC := g++ -CCFLAGS := ${CCFLAGS} -O2 -Wall -Wextra -MMD -fopenmp +CCFLAGS_COMMON := -Wall -Wextra -MMD -fopenmp +CCFLAGS_RELEASE := -O2 +CCFLAGS_DEBUG := -g -O0 -DDEBUG +CCFLAGS_ASAN := -g -O1 -DDEBUG -fsanitize=address -fno-omit-frame-pointer +CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_RELEASE} LDFLAGS := ${LDFLAGS} -fopenmp +LDFLAGS_ASAN := ${LDFLAGS} -fsanitize=address + OBJDIR := build DNNDIR := dnn BINDIR := bin TARGET := $(BINDIR)/run_export +DEBUG_OBJDIR := build_debug +DEBUG_TARGET := $(BINDIR)/run_export_debug +ASAN_OBJDIR := build_asan +ASAN_TARGET := $(BINDIR)/run_export_asan + +# Export Params +AIDGE_CMP := false # Compare fmaps with Aidge ref +SAVE_OUTPUTS := false # Store fmaps into files + +ifeq ($(AIDGE_CMP), true) + PRMFLAGS := ${PRMFLAGS} -DAIDGE_CMP +endif +ifeq ($(SAVE_OUTPUTS), true) + PRMFLAGS := ${PRMFLAGS} -DSAVE_OUTPUTS +endif INCLUDE_DIRS :=-I. 
-I./${DNNDIR} -I./${DNNDIR}/include -I./${DNNDIR}/layers -I./${DNNDIR}/parameters CC_SRCS := $(shell find . -iname "*.cpp") CC_OBJS := $(patsubst %.cpp, ${OBJDIR}/%.o, ${CC_SRCS}) +DEBUG_OBJS := $(patsubst %.cpp, ${DEBUG_OBJDIR}/%.o, ${CC_SRCS}) +ASAN_OBJS := $(patsubst %.cpp, ${ASAN_OBJDIR}/%.o, ${CC_SRCS}) DEPENDENCIES := $(patsubst %.o, %.d, ${CC_OBJS}) +DEBUG_DEPENDENCIES := $(patsubst %.o, %.d, ${DEBUG_OBJS}) +ASAN_DEPENDENCIES := $(patsubst %.o, %.d, ${ASAN_OBJS}) + +.PHONY: all build debug asan clean all: build @@ -18,13 +45,35 @@ build: ${CC_OBJS} @mkdir -p $(dir ${TARGET}) ${CC} ${CC_OBJS} ${LDFLAGS} -o ${TARGET} +debug: CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_DEBUG} +debug: ${DEBUG_OBJS} + @mkdir -p $(dir ${DEBUG_TARGET}) + ${CC} ${DEBUG_OBJS} ${LDFLAGS} -o ${DEBUG_TARGET} + +asan: CCFLAGS := ${CCFLAGS_COMMON} ${CCFLAGS_ASAN} +asan: ${ASAN_OBJS} + @mkdir -p $(dir ${ASAN_TARGET}) + ${CC} ${ASAN_OBJS} ${LDFLAGS_ASAN} -o ${ASAN_TARGET} + ${OBJDIR}/%.o: %.cpp @mkdir -p $(dir $@) - ${CC} ${CCFLAGS} ${INCLUDE_DIRS} -c $< -o $@ + ${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@ + +${DEBUG_OBJDIR}/%.o: %.cpp + @mkdir -p $(dir $@) + ${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@ + +${ASAN_OBJDIR}/%.o: %.cpp + @mkdir -p $(dir $@) + ${CC} ${CCFLAGS} ${PRMFLAGS} ${INCLUDE_DIRS} -c $< -o $@ clean: if [ -d "$(OBJDIR)" ]; then rm -rf $(OBJDIR); fi + if [ -d "$(DEBUG_OBJDIR)" ]; then rm -rf $(DEBUG_OBJDIR); fi + if [ -d "$(ASAN_OBJDIR)" ]; then rm -rf $(ASAN_OBJDIR); fi if [ -d "$(BINDIR)" ]; then rm -rf $(BINDIR); fi -include $(DEPENDENCIES) +-include $(DEBUG_DEPENDENCIES) +-include $(ASAN_DEPENDENCIES) diff --git a/aidge_export_cpp/static/activation_utils.hpp b/aidge_export_cpp/static/activation_utils.hpp index c6a1bcdc0ce289a384519673ca04a001a5ca9692..ddac7fb46061c5ca909d1f0571723f0842dbda64 100644 --- a/aidge_export_cpp/static/activation_utils.hpp +++ b/aidge_export_cpp/static/activation_utils.hpp @@ -48,9 +48,5 @@ Output_T activation_forward_value (Sum_T weightedSum, break; } - // Value fixed here for now but it should be generated by - // the export module or determined by the type of Output_T - // For now only works for int8_t and uint8_t - const uint32_t NB_BITS = 8; - return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS); + return saturate<Output_T>(rescaling(weightedSum, output), 8 * sizeof(Output_T)); } diff --git a/aidge_export_cpp/static/rescaling_utils.hpp b/aidge_export_cpp/static/rescaling_utils.hpp index 4fdb321820f92f8d33e474aabc4665a99cb0d4b0..8825197e743e7a488cbc411dc83af6a1968d9974 100644 --- a/aidge_export_cpp/static/rescaling_utils.hpp +++ b/aidge_export_cpp/static/rescaling_utils.hpp @@ -1,14 +1,16 @@ #pragma once +#include <sys/types.h> + // --------------------------------------------------- // ----------------- Saturate Utils ------------------ // --------------------------------------------------- -static int64_t toInt64(uint32_t lo, uint32_t hi) { +constexpr int64_t toInt64(uint32_t lo, uint32_t hi) { return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo); } -static int64_t smlal(int32_t lhs, int32_t rhs, +constexpr int64_t smlal(int32_t lhs, int32_t rhs, uint32_t accumLo, uint32_t accumHi) { return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi); @@ -52,9 +54,9 @@ struct FixedPointScaling { } // Attributes - static const uint32_t HALF_LO = (SHIFT > 0) + static constexpr uint32_t HALF_LO = (SHIFT > 0) ? 
(1ull << (SHIFT - 1)) & 0xFFFFFFFF : 0; - static const uint32_t HALF_HI = (SHIFT > 0) + static constexpr uint32_t HALF_HI = (SHIFT > 0) ? (1ull << (SHIFT - 1)) >> 32u : 0; // static const int32_t mScaling = SCALING; diff --git a/aidge_export_cpp/static/typedefs.hpp b/aidge_export_cpp/static/typedefs.hpp index acece91115f73a57197c8a423cd34ec37b2f2e2a..8ecd095639a1afeb03b96f8f59149545cfd3a825 100644 --- a/aidge_export_cpp/static/typedefs.hpp +++ b/aidge_export_cpp/static/typedefs.hpp @@ -19,7 +19,8 @@ typedef enum { typedef enum { Add, Sub, - Mul + Mul, + Div } ElemWise_T; typedef enum { diff --git a/aidge_export_cpp/static/utils.hpp b/aidge_export_cpp/static/utils.hpp index b9b739269216f6e02e05f5da3bcb3c2a8df30150..6142ac3f34951c424910b7294e8ba0af886c091e 100644 --- a/aidge_export_cpp/static/utils.hpp +++ b/aidge_export_cpp/static/utils.hpp @@ -52,10 +52,24 @@ int min (int lhs, int rhs) return (lhs <= rhs) ? lhs : rhs; } +template <class InputIt, class Size, class OutputIt> +__attribute__((always_inline)) +static inline OutputIt copy_n(InputIt first, Size count, OutputIt result) { + if (count > 0) { + *result = *first; + ++result; + for (Size i = 1; i != count; ++i, ++result) { + *result = *++first; + } + } + + return result; +} + +#if SAVE_OUTPUTS || AIDGE_CMP -#if SAVE_OUTPUTS enum class Format { - Default, + DEFAULT, NCHW, NHWC, CHWN, @@ -64,105 +78,152 @@ enum class Format { CDHWN }; +#endif // SAVE_OUTPUTS || AIDGE_CMP + +#if SAVE_OUTPUTS + +template<int NB_OUTPUTS, int OUT_HEIGHT, int OUT_WIDTH, + size_t MEM_CONT_OFFSET, + size_t MEM_CONT_SIZE, + size_t MEM_WRAP_OFFSET, + size_t MEM_WRAP_SIZE, + Format FMT, typename Output_T> +inline void saveOutputs(const Output_T* __restrict outputs, FILE* pFile) { + int offset = 0; + + // NCHW + if (FMT == Format::NCHW || FMT == Format::DEFAULT) { + fprintf(pFile, "{"); + for (auto out = 0; out < NB_OUTPUTS; ++out) { + fprintf(pFile, "{"); + for (auto h = 0; h < OUT_HEIGHT; ++h) { + fprintf(pFile, "{"); + for (auto w = 0; w < OUT_WIDTH; ++w) { + if (MEM_WRAP_SIZE > 0 && offset == static_cast<int>(MEM_CONT_SIZE / sizeof(Output_T))) { + offset += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET + - MEM_CONT_SIZE) / sizeof(Output_T); + } -template<typename Output_T> -inline void saveOutputs( - int NB_OUTPUTS, - int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, - // int OUTPUT_MEM_CONT_OFFSET, - // int OUTPUT_MEM_CONT_SIZE, - // int OUTPUT_MEM_WRAP_OFFSET, - // int OUTPUT_MEM_WRAP_SIZE, - // int OUTPUT_MEM_STRIDE, - const Output_T* __restrict outputs, - FILE* pFile, - Format format) -{ - // default is NHCW ! 
- if (format == Format::NHWC) { - fprintf(pFile, "("); - auto oOffset = 0; - for(int oy = 0; oy < OUTPUTS_HEIGHT; oy++) { - fprintf(pFile, "("); - - for(int ox = 0; ox < OUTPUTS_WIDTH; ox++) { - fprintf(pFile, "("); - - // const int oPos = (ox + OUTPUTS_WIDTH * oy); - // int oOffset = OUTPUT_MEM_STRIDE * oPos; - - // if (OUTPUT_MEM_WRAP_SIZE > 0 - // && oOffset >= OUTPUT_MEM_CONT_SIZE) - // { - // oOffset += OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET - // - OUTPUT_MEM_CONT_SIZE; - // } - - for (int output = 0; output < NB_OUTPUTS; output++) { if (std::is_floating_point<Output_T>::value) - fprintf(pFile, "%f", static_cast<float>(outputs[oOffset])); + fprintf(pFile, "%f", static_cast<float>(outputs[offset])); else - fprintf(pFile, "%d", static_cast<int>(outputs[oOffset])); - oOffset += 1; + fprintf(pFile, "%d", static_cast<int>(outputs[offset])); + ++offset; fprintf(pFile, ", "); - } - fprintf(pFile, "), \n"); + } + fprintf(pFile, "}\n"); } - - fprintf(pFile, "), \n"); + fprintf(pFile, "}\n"); } + fprintf(pFile, "}\n"); - fprintf(pFile, ")\n"); - } - else if (format == Format::NCHW || format == Format::Default) { - auto ofst = 0; - for(int output = 0; output < NB_OUTPUTS; output++) { - fprintf(pFile, "%d:\n", output); - for(int oy = 0; oy < OUTPUTS_HEIGHT; oy++) { - for(int ox = 0; ox < OUTPUTS_WIDTH; ox++) { - fprintf(pFile, "%d", static_cast<int>(outputs[ofst])); - fprintf(pFile, " "); - ofst += 1; - } + // NHWC + } else if (FMT == Format::NHWC) { + fprintf(pFile, "{\n"); // Start outer brace + for (auto h = 0; h < OUT_HEIGHT; ++h) { + fprintf(pFile, " {\n"); // Indent level 1 + for (auto w = 0; w < OUT_WIDTH; ++w) { + fprintf(pFile, " { "); // Indent level 2 and open inner brace + for (auto out = 0; out < NB_OUTPUTS; ++out) { + if (MEM_WRAP_SIZE > 0 && offset == static_cast<int>(MEM_CONT_SIZE / sizeof(Output_T))) { + offset += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET + - MEM_CONT_SIZE) / sizeof(Output_T); + } - fprintf(pFile, "\n"); - } + if (std::is_floating_point<Output_T>::value) + fprintf(pFile, "%f", static_cast<float>(outputs[offset])); + else + fprintf(pFile, "%4d", static_cast<int>(outputs[offset])); + ++offset; - fprintf(pFile, "\n"); + // Add comma except for last element + if (out != NB_OUTPUTS - 1) + fprintf(pFile, ","); + } + fprintf(pFile, " },\n"); // Close inner brace and newline + } + fprintf(pFile, " },\n"); // Close w-loop brace and newline } + fprintf(pFile, "}\n"); // Close outer brace - fprintf(pFile, "\n"); - } - else { - printf("Warning unsupported dataformat.\n"); + } else { + printf("[ERROR] - Format is not supported.\n"); + printf("[ERROR] - Aborting save outputs...\n"); + return; } } #endif // SAVE_OUTPUTS #if AIDGE_CMP -template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT, typename AidgeOutput_T, typename DevOutput_T> +template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT, Format FMT> +int get_ofst_from_fmt(int out, int h, int w) { + if (FMT == Format::NCHW || FMT == Format::DEFAULT) + return out * OUT_HEIGHT * OUT_WIDTH + h * OUT_WIDTH + w; + else if (FMT == Format::NHWC) + return h * OUT_WIDTH * NB_OUTPUTS + w * NB_OUTPUTS + out; + else { + printf("[ERROR] - This data format is not supported.\n"); + return -1; + } +} + +template<int NB_OUTPUTS, int OUT_WIDTH, int OUT_HEIGHT, + size_t MEM_CONT_OFFSET, + size_t MEM_CONT_SIZE, + size_t MEM_WRAP_OFFSET, + size_t MEM_WRAP_SIZE, + Format AIDGE_FMT, Format DEV_FMT, typename AidgeOutput_T, typename DevOutput_T> void aidge_cmp(std::string layer_name, AidgeOutput_T* aidge_output, DevOutput_T* dev_output) { - 
printf("[AIDGE COMPARE] - %s\n", layer_name.c_str()); + printf("[NOTICE] - Comparing with Aidge ref for node : %s -> ", layer_name.c_str()); + + const float atol = 1e-5f; // Absolute + const float rtol = 1e-3f; // Relative for (auto out = 0; out < NB_OUTPUTS; ++out) { for (auto h = 0; h < OUT_HEIGHT; ++h) { for (auto w = 0; w < OUT_WIDTH; ++w) { - const int aidge_ofst = out * OUT_HEIGHT * OUT_WIDTH + h * OUT_WIDTH + w; - const int dev_ofst = h * OUT_WIDTH * NB_OUTPUTS + w * NB_OUTPUTS + out; - if (aidge_output[aidge_ofst] != dev_output[dev_ofst]) { - if (std::is_floating_point<DevOutput_T>::value) { - printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %f vs dev_out = %f\n", - out, h, w, static_cast<double>(aidge_output[aidge_ofst]), static_cast<double>(dev_output[dev_ofst])); - } else { + + const int aidge_ofst = get_ofst_from_fmt<NB_OUTPUTS, OUT_WIDTH, OUT_HEIGHT, AIDGE_FMT>(out, h, w); + int dev_ofst = get_ofst_from_fmt<NB_OUTPUTS, OUT_WIDTH, OUT_HEIGHT, DEV_FMT>(out, h, w); + + if (aidge_ofst == -1 || dev_ofst == -1) { + printf("[FAILURE]\n"); + printf("[ERROR] - Aborting this layer comparison...\n"); + return; + } + + if (MEM_WRAP_SIZE > 0 && dev_ofst >= static_cast<int>(MEM_CONT_SIZE / sizeof(DevOutput_T))) { + dev_ofst += (MEM_WRAP_OFFSET - MEM_CONT_OFFSET + - MEM_CONT_SIZE) / sizeof(DevOutput_T); + } + + // Float Comparison + if (std::is_floating_point<DevOutput_T>::value) { + + const float diff = std::abs(aidge_output[aidge_ofst] - dev_output[dev_ofst]); + const float tolerance = atol + rtol * std::abs(dev_output[dev_ofst]); + + if (diff > tolerance) { + printf("[FAILURE]\n"); + printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %f vs dev_out = %f\n", + out, h, w, static_cast<double>(aidge_output[aidge_ofst]), static_cast<double>(dev_output[dev_ofst])); + printf("Abort program.\n"); + exit(1); + } + + // Int Comparison + } else { + if (aidge_output[aidge_ofst] != dev_output[dev_ofst]) { + printf("[FAILURE]\n"); printf("[ERROR] - First error detected at %dx%dx%d (out x h x w) : aidge_out = %d vs dev_out = %d\n", - out, h, w, static_cast<int>(aidge_output[aidge_ofst]), static_cast<int>(dev_output[dev_ofst])); + out, h, w, static_cast<int>(aidge_output[aidge_ofst]), static_cast<int>(dev_output[dev_ofst])); + printf("[ERROR] - Abort program.\n"); + exit(1); } - printf("Abort program.\n"); - exit(1); } } } diff --git a/aidge_export_cpp/templates/configuration/_def_io.jinja b/aidge_export_cpp/templates/configuration/_def_io.jinja index 314ae39a491f2bf1eafec4b0ee5aaab156d3c987..4bbb86666d56656cd2477f6b0d398b58bf08be75 100644 --- a/aidge_export_cpp/templates/configuration/_def_io.jinja +++ b/aidge_export_cpp/templates/configuration/_def_io.jinja @@ -1,9 +1,11 @@ {# NOTE: Suppose input is first #} {# // INPUT CONF {% for inidx in range(nb_in) -%} +{% if in_name[inidx] %} #define {{ in_name[inidx]|upper }}_NB_CHANNELS {{ in_chan[inidx] }} #define {{ in_name[inidx]|upper }}_IN_HEIGHT {{ in_height[inidx] }} #define {{ in_name[inidx]|upper }}_IN_WIDTH {{ in_width[inidx] }} +{% endif %} {% endfor %} #} // INPUT CONF diff --git a/aidge_export_cpp/templates/configuration/_meminfo.jinja b/aidge_export_cpp/templates/configuration/_meminfo.jinja index 15d15425b4330f68b4a97c31e9cf7a1076cc93e8..6c99509c8fc93fd550a1b45e4803e23a072145f2 100644 --- a/aidge_export_cpp/templates/configuration/_meminfo.jinja +++ b/aidge_export_cpp/templates/configuration/_meminfo.jinja @@ -1,11 +1,34 @@ // MEMINFO CONF +{% for inidx in range(nb_in) -%} +{% if 
in_name[inidx] %} +{# Specify a default memory layout for standalone input tensors -#} +{% if not in_node[inidx] %} +{% if in_dformat[inidx]|string == 'nhwc' or in_dformat[inidx]|string == 'nwc' %} +#define {{ in_name[inidx]|upper }}_MEM_SIZE {{ in_chan[inidx] * in_sizeof[inidx] }} +#define {{ in_name[inidx]|upper }}_MEM_OFFSET 0 +#define {{ in_name[inidx]|upper }}_MEM_STRIDE {{ in_chan[inidx] * in_sizeof[inidx] }} +#define {{ in_name[inidx]|upper }}_MEM_LENGTH {{ in_width[inidx] }} +{% else %} +#define {{ in_name[inidx]|upper }}_MEM_SIZE {{ in_size[inidx] * in_sizeof[inidx] }} +#define {{ in_name[inidx]|upper }}_MEM_OFFSET 0 +#define {{ in_name[inidx]|upper }}_MEM_STRIDE {{ in_size[inidx] * in_sizeof[inidx] }} +#define {{ in_name[inidx]|upper }}_MEM_LENGTH 1 +{% endif %} +#define {{ in_name[inidx]|upper }}_MEM_CONT_SIZE {{ in_size[inidx] * in_sizeof[inidx] }} +#define {{ in_name[inidx]|upper }}_MEM_CONT_OFFSET 0 +#define {{ in_name[inidx]|upper }}_MEM_WRAP_OFFSET 0 +#define {{ in_name[inidx]|upper }}_MEM_WRAP_SIZE 0 +{% endif %} +{% endif %} +{% endfor %} + {% for outidx in range(nb_out) -%} -#define {{ out_name[outidx]|upper }}_SIZE {{ mem_info_size[outidx]}} -#define {{ out_name[outidx]|upper }}_OFFSET {{ mem_info_offset[outidx]}} -#define {{ out_name[outidx]|upper }}_STRIDE {{ mem_info_stride[outidx]}} -#define {{ out_name[outidx]|upper }}_LENGTH {{ mem_info_length[outidx]}} -#define {{ out_name[outidx]|upper }}_CONT_SIZE {{ mem_info_cont_size[outidx]}} -#define {{ out_name[outidx]|upper }}_CONT_OFFSET {{ mem_info_cont_offset[outidx]}} -#define {{ out_name[outidx]|upper }}_WRAP_OFFSET {{ mem_info_wrap_offset[outidx]}} -#define {{ out_name[outidx]|upper }}_WRAP_SIZE {{ mem_info_wrap_size[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_SIZE {{ mem_info_size[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_OFFSET {{ mem_info_offset[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_STRIDE {{ mem_info_stride[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_LENGTH {{ mem_info_length[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_CONT_SIZE {{ mem_info_cont_size[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_CONT_OFFSET {{ mem_info_cont_offset[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_WRAP_OFFSET {{ mem_info_wrap_offset[outidx]}} +#define {{ out_name[outidx]|upper }}_MEM_WRAP_SIZE {{ mem_info_wrap_size[outidx]}} {% endfor %} diff --git a/aidge_export_cpp/templates/configuration/_save_outputs.jinja b/aidge_export_cpp/templates/configuration/_save_outputs.jinja new file mode 100644 index 0000000000000000000000000000000000000000..358b9cf40e16715882809a1b69be8b86493799b4 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/_save_outputs.jinja @@ -0,0 +1 @@ +#define {{ out_name[0] | upper }}_DEV_FMT Format::{{ out_format[0] | upper }} \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/activation_config.jinja b/aidge_export_cpp/templates/configuration/activation_config.jinja index 84b122ba5207f0022d72f35bb4f8e7064bf7fe32..45c2760706fd6e3a07b9cf5c9be8a1e585a2f0b1 100644 --- a/aidge_export_cpp/templates/configuration/activation_config.jinja +++ b/aidge_export_cpp/templates/configuration/activation_config.jinja @@ -4,9 +4,9 @@ #include "network/rescaling_utils.hpp" {# For layer configuration -#} -{%- set nb_data = in_chan[0] * in_height[0] * in_width[0] %} -#define {{ name|upper }}_NB_DATA {{ nb_data }} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} #define {{ name|upper }}_ACTIVATION {{ activation }} +{% include 
"./_save_outputs.jinja" %} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} {% include "./_rescaling.jinja" %} diff --git a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja index 0c0bc49b521556eee1a4e455486caae44a2b86cb..34412212dbb1ff689d4c04d89f8206e22bad8d3b 100644 --- a/aidge_export_cpp/templates/configuration/batchnorm_config.jinja +++ b/aidge_export_cpp/templates/configuration/batchnorm_config.jinja @@ -8,6 +8,7 @@ {% include "./_meminfo.jinja" %} #define {{ name|upper }}_ACTIVATION {{ activation }} #define {{ name|upper }}_EPSILON {{ epsilon }} +{% include "./_save_outputs.jinja" %} {% include "./_rescaling.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/concat_config.jinja b/aidge_export_cpp/templates/configuration/concat_config.jinja index ea8246db9a315a371e0cacea5d45d07fa2b8f7e8..faaf5aad481975a80f5f95e50277fe4547ee1522 100644 --- a/aidge_export_cpp/templates/configuration/concat_config.jinja +++ b/aidge_export_cpp/templates/configuration/concat_config.jinja @@ -2,17 +2,20 @@ #ifndef {{ name|upper }}_LAYER_H #define {{ name|upper }}_LAYER_H +#include <sys/types.h> + {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} // Attributes #define {{ name|upper }}_NB_INPUTS {{ nb_in }} #define {{ name|upper }}_AXIS {{ axis }} -{%- for i in range(nb_in) %} -#define {{ name|upper }}_INPUT_{{i}}_SIZE {{ axis_size[i] }} -{%- endfor %} +constexpr size_t {{name|upper}}_AXIS_SIZE[] = { {{ axis_size|join(", ") }} }; +#define {{name|upper}}_TOTAL_AXIS_SIZE ({{ axis_size|join('+') }}) #define {{ name|upper }}_AXIS_SIZE_POST {{ axis_size_post }} #define {{ name|upper }}_AXIS_SIZE_PRE {{ axis_size_pre }} +{% include "./_save_outputs.jinja" %} + #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja index b72df4d10f5342f661e921f4b2a7dbaf79d32e85..5774e031ba553414250886a0896de1f491425e80 100644 --- a/aidge_export_cpp/templates/configuration/convolution_config.jinja +++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja @@ -5,20 +5,21 @@ {# For layer configuration -#} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} -#define {{ name|upper }}_PADDING_Y {{ padding[0] }} -#define {{ name|upper }}_PADDING_X {{ padding[1] }} -#define {{ name|upper }}_STRIDE_Y {{ stride_dims[0] }} -#define {{ name|upper }}_STRIDE_X {{ stride_dims[1] }} -#define {{ name|upper }}_DILATION_Y {{ dilation_dims[0] }} -#define {{ name|upper }}_DILATION_X {{ dilation_dims[1] }} -#define {{ name|upper }}_KERNEL_HEIGHT {{ kernel_dims[0] }} -#define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] }} +#define {{ name|upper }}_PADDING_Y {{ padding[0] if padding|length > 2 else 0 }} +#define {{ name|upper }}_PADDING_X {{ padding[1] if padding|length > 2 else padding[0] }} +#define {{ name|upper }}_STRIDE_Y {{ stride_dims[0] if stride_dims|length > 1 else 1 }} +#define {{ name|upper }}_STRIDE_X {{ stride_dims[1] if stride_dims|length > 1 else stride_dims[0] }} +#define {{ name|upper }}_DILATION_Y {{ dilation_dims[0] if dilation_dims|length > 1 else 1 }} +#define {{ name|upper }}_DILATION_X {{ dilation_dims[1] if dilation_dims|length > 1 else dilation_dims[0] }} +#define {{ name|upper }}_KERNEL_HEIGHT {{ kernel_dims[0] if kernel_dims|length > 1 else 1 }} +#define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] if 
kernel_dims|length > 1 else kernel_dims[0] }} #define {{ name|upper }}_ACTIVATION {{ activation }} +{% include "./_save_outputs.jinja" %} {% include "./_rescaling.jinja" %} {#- Calculate sizes #} -{%- set weights_size = out_chan[0] * kernel_dims[1] * kernel_dims[0] if depthwise is defined - else out_chan[0] * in_chan[0] * kernel_dims[1] * kernel_dims[0] %} +{%- set weights_size = out_chan[0] * (kernel_dims[1] if kernel_dims|length > 1 else 1) * kernel_dims[0] if depthwise is defined + else out_chan[0] * in_chan[0] * (kernel_dims[1] if kernel_dims|length > 1 else 1) * kernel_dims[0] %} #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }} #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }} diff --git a/aidge_export_cpp/templates/configuration/elemwise_config.jinja b/aidge_export_cpp/templates/configuration/elemwise_config.jinja index f839602fff707bc4dc30b11835846c977130cab4..2ccf5fa73a1b464ac1145684e92f1e00d0204b9f 100644 --- a/aidge_export_cpp/templates/configuration/elemwise_config.jinja +++ b/aidge_export_cpp/templates/configuration/elemwise_config.jinja @@ -3,12 +3,23 @@ #define {{ name|upper }}_LAYER_H #include "network/rescaling_utils.hpp" +#include <sys/types.h> + {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} {# For layer configuration -#} -#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_MAT {{ offset_in1|length }} +#define {{ name|upper }}_INPUT1_CONT_SIZE {{ input1_cont_size }} +#define {{ name|upper }}_INPUT2_CONT_SIZE {{ input2_cont_size }} +#define {{ name|upper }}_OUTPUT_CONT_SIZE {{ output_cont_size }} + +constexpr size_t {{name|upper}}_OFFSET_IN1[] = { {{ offset_in1|join(", ") }} }; +constexpr size_t {{name|upper}}_OFFSET_IN2[] = { {{ offset_in2|join(", ") }} }; + #define {{ name|upper }}_ACTIVATION {{ activation }} #define {{ name|upper }}_ELEM_OP {{ elemwise_op }} + +{% include "./_save_outputs.jinja" %} {% include "./_rescaling.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/erf_config.jinja b/aidge_export_cpp/templates/configuration/erf_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1e8e02c81188ddfeeb0d1c6e821f2509d6bb9a9e --- /dev/null +++ b/aidge_export_cpp/templates/configuration/erf_config.jinja @@ -0,0 +1,11 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +{# For layer configuration -#} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +{% include "./_save_outputs.jinja" %} + +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja b/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja index 856d727abc11ceb6f914e9d71d286ef5882322d6..b50f64fb500b7ae036f91821b689175c2c9b3e92 100644 --- a/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja +++ b/aidge_export_cpp/templates/configuration/fullyconnected_config.jinja @@ -13,4 +13,6 @@ #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }} #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }} +{% include "./_save_outputs.jinja" %} + #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/hardmax_config.jinja b/aidge_export_cpp/templates/configuration/hardmax_config.jinja index b5bf60cfd28ead3b0b787a0750c5ceae648e5e72..247a44cfe9da08864ea06d08264b04f9fb900876 100644 --- 
a/aidge_export_cpp/templates/configuration/hardmax_config.jinja +++ b/aidge_export_cpp/templates/configuration/hardmax_config.jinja @@ -10,5 +10,6 @@ #define {{ name|upper }}_AXIS_STRIDE {{ axis_stride }} #define {{ name|upper }}_POSTAXIS_STRIDE {{ postaxis_stride }} #define {{ name|upper }}_INOUT_NB_ELTS {{ out_nb_elts }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/reshape_config.jinja b/aidge_export_cpp/templates/configuration/identity_config.jinja similarity index 88% rename from aidge_export_cpp/templates/configuration/reshape_config.jinja rename to aidge_export_cpp/templates/configuration/identity_config.jinja index 88c8cc21f134b333e354c0df96dd07f41e948506..80318d1fa23d8e6caa401d0d40f080339343c2be 100644 --- a/aidge_export_cpp/templates/configuration/reshape_config.jinja +++ b/aidge_export_cpp/templates/configuration/identity_config.jinja @@ -6,5 +6,6 @@ {% include "./_meminfo.jinja" %} {# For layer configuration -#} #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja b/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja index 80903622d394bac9132ae3015f82ef72ac2242ea..2d6bd87ff2eb64026dd25a5741bf1fb89f1ebc6d 100644 --- a/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja +++ b/aidge_export_cpp/templates/configuration/leakyrelu_config.jinja @@ -5,7 +5,8 @@ {# For layer configuration -#} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} -#define {{ name|upper }}_NB_DATA {{ nb_data }} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} #define {{ name|upper }}_ALPHA {{ alpha }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/matmul_config.jinja b/aidge_export_cpp/templates/configuration/matmul_config.jinja index d0d4958e505b3208598fe387bba357a0c3d84602..0c4982867482738af9fddcd15cee59c308244c88 100644 --- a/aidge_export_cpp/templates/configuration/matmul_config.jinja +++ b/aidge_export_cpp/templates/configuration/matmul_config.jinja @@ -3,16 +3,24 @@ #define {{ name|upper }}_LAYER_H #include "network/rescaling_utils.hpp" +#include <sys/types.h> + +{# For layer configuration -#} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} +#define {{ name|upper }}_NB_MAT {{ offset_in1|length }} +constexpr size_t {{name|upper}}_OFFSET_IN1[] = { {{ offset_in1|join(", ") }} }; +constexpr size_t {{name|upper}}_OFFSET_IN2[] = { {{ offset_in2|join(", ") }} }; + +#define {{ name|upper }}_N {{ n }} +#define {{ name|upper }}_M {{ m }} +#define {{ name|upper }}_K {{ k }} -{# For layer configuration -#} -#define {{ name|upper }}_M {{ in_dims[0][0] }} -#define {{ name|upper }}_K {{ in_dims[0][1] }} -#define {{ name|upper }}_N {{ in_dims[1][1] }} #define {{ name|upper }}_ACTIVATION {{ activation }} -{% include "./_rescaling.jinja" %} +{% include "./_save_outputs.jinja" %} + +{% include "./_rescaling.jinja" %} {#- Calculate sizes #} diff --git a/aidge_export_cpp/templates/configuration/pad_config.jinja b/aidge_export_cpp/templates/configuration/pad_config.jinja index 8b21577fe4d6f52ddb36ae796740f265db3d45cc..ed6f8ae79d19925041d7c2fec587a53f59770200 100644 --- a/aidge_export_cpp/templates/configuration/pad_config.jinja +++ b/aidge_export_cpp/templates/configuration/pad_config.jinja @@ -9,5 +9,6 @@ #define {{ name|upper 
}}_PADDING_TOP {{ padding[0] }} #define {{ name|upper }}_PADDING_LEFT {{ padding[1] }} #define {{ name|upper }}_BORDER_VALUE {{ border_value }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/pooling_config.jinja b/aidge_export_cpp/templates/configuration/pooling_config.jinja index 57608fcb9c8071a1c686b10565d6adab78735079..ad6a7aebe54d2386e5bc0d47b2e80b0ea019aad6 100644 --- a/aidge_export_cpp/templates/configuration/pooling_config.jinja +++ b/aidge_export_cpp/templates/configuration/pooling_config.jinja @@ -13,5 +13,6 @@ #define {{ name|upper }}_KERNEL_WIDTH {{ kernel_dims[1] }} #define {{ name|upper }}_POOLING_TYPE {{ pool_type }} #define {{ name|upper }}_ACTIVATION {{ activation }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/producer_config.jinja b/aidge_export_cpp/templates/configuration/producer_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..ad0660f50d94984b1113af2cc4fae4cca5ac00c5 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/producer_config.jinja @@ -0,0 +1,8 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +{# For layer configuration -#} +{% include "./_meminfo.jinja" %} + +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/reducemean_config.jinja b/aidge_export_cpp/templates/configuration/reducemean_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..369ac8a3e7d70f8949e1c18e65cd7b0eb319e71f --- /dev/null +++ b/aidge_export_cpp/templates/configuration/reducemean_config.jinja @@ -0,0 +1,27 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +#include <sys/types.h> + +{# For layer configuration -#} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} + +#define {{ in_name[0]|upper }}_NB_DIMS {{ in_nb_dims }} +#define {{ in_name[0]|upper }}_NB_ELTS {{ in_nb_elts }} +#define {{ out_name[0]|upper }}_NB_ELTS {{ out_nb_elts }} +#define {{ name|upper }}_NB_AXES_TO_REDUCES {{ nb_axes_to_reduce }} +constexpr size_t {{ in_name[0]|upper }}_DIMS[{{ in_nb_dims }}] = + { {{ in_dims|join(", ") }} }; +constexpr size_t {{ name|upper }}_AXES_TO_REDUCE[{{ nb_axes_to_reduce }}] = + { {{ axes_to_reduce|join(", ") }} }; +constexpr size_t {{ name|upper }}_PREAXIS_STRIDES[{{ in_nb_dims }}] = + { {{ pre_axis_strides|join(", ") }} }; +constexpr size_t {{ name|upper }}_POSTAXIS_STRIDES[{{ in_nb_dims }}] = + { {{ post_axis_strides|join(", ") }} }; + +{% include "./_save_outputs.jinja" %} + +#endif /* {{ name|upper }}_LAYER_H */ + diff --git a/aidge_export_cpp/templates/configuration/rescaling_config.jinja b/aidge_export_cpp/templates/configuration/rescaling_config.jinja deleted file mode 100644 index 6f4e3ad80e4f72d180a9add5fd7978181a71031d..0000000000000000000000000000000000000000 --- a/aidge_export_cpp/templates/configuration/rescaling_config.jinja +++ /dev/null @@ -1,16 +0,0 @@ -{#- For name header -#} -#ifndef {{ name|upper }}_LAYER_H -#define {{ name|upper }}_LAYER_H -#include "network/rescaling_utils.hpp" - -{# For layer configuration -#} -{% include "./_def_io.jinja" %} -{% include "./_meminfo.jinja" %} - -#define {{ name|upper }}_NB_DATA {{ in_chan[0] * in_height[0] * in_width[0] }} - -// Activation -#define {{ name|upper }}_ACTIVATION {{ activation }} -{% include 
"./_rescaling.jinja" %} - -#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/sigmoid_config.jinja b/aidge_export_cpp/templates/configuration/sigmoid_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1e8e02c81188ddfeeb0d1c6e821f2509d6bb9a9e --- /dev/null +++ b/aidge_export_cpp/templates/configuration/sigmoid_config.jinja @@ -0,0 +1,11 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +{# For layer configuration -#} +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +{% include "./_save_outputs.jinja" %} + +#endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/configuration/slice_config.jinja b/aidge_export_cpp/templates/configuration/slice_config.jinja new file mode 100644 index 0000000000000000000000000000000000000000..e3af737ba8aef29937ea95da05a46310bc1f30a6 --- /dev/null +++ b/aidge_export_cpp/templates/configuration/slice_config.jinja @@ -0,0 +1,20 @@ +{#- For name header -#} +#ifndef {{ name|upper }}_LAYER_H +#define {{ name|upper }}_LAYER_H + +#include <sys/types.h> + +{% include "./_def_io.jinja" %} +{% include "./_meminfo.jinja" %} +{# For layer configuration -#} +#define {{ name|upper }}_NB_DIMS {{ in_dims[0] | length }} +#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} +#define {{ name|upper }}_NB_AXES {{ axes| length }} +constexpr size_t {{ name|upper }}_STARTS[] = { {{ starts | join(', ') }} }; +constexpr size_t {{ name|upper }}_ENDS[] = { {{ ends | join(', ') }} }; +constexpr size_t {{ name|upper }}_STEPS[] = { {{ steps | join(', ') }} }; +constexpr size_t {{ name|upper }}_AXES_MOD[] = { {{ axes_mod | join(', ') }} }; +constexpr size_t {{ name|upper }}_AXES_DIV[] = { {{ axes_div | join(', ') }} }; +{% include "./_save_outputs.jinja" %} + +#endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/softmax_config.jinja b/aidge_export_cpp/templates/configuration/softmax_config.jinja index 6dc6b3453e103757c184e7a6efdfd9d21c5cbd43..095bce86e3d30ef77a5599f47b031aa0ba25775e 100644 --- a/aidge_export_cpp/templates/configuration/softmax_config.jinja +++ b/aidge_export_cpp/templates/configuration/softmax_config.jinja @@ -8,5 +8,6 @@ #define {{ name|upper }}_AXIS_SIZE {{ axis_size }} #define {{ name|upper }}_AXIS_SIZE_POST {{ axis_size_post }} #define {{ name|upper }}_AXIS_SIZE_PRE {{ axis_size_pre }} +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ diff --git a/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja b/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja index 56bb5bd524a5927d7461391c18df6077c97766db..6ccd728adbd3256b98ff00284ba7d4aeb7ffcb3a 100644 --- a/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja +++ b/aidge_export_cpp/templates/configuration/transpose_ND_config.jinja @@ -2,14 +2,19 @@ #ifndef {{ name|upper }}_LAYER_H #define {{ name|upper }}_LAYER_H +#include <sys/types.h> + {# For layer configuration -#} {% include "./_def_io.jinja" %} {% include "./_meminfo.jinja" %} {# Export suppose that batchsize = 1#} #define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }} #define {{ name|upper }}_NB_DIMS {{ in_dims[0] | length }} +constexpr size_t {{ name|upper }}_IN_STRIDES[] = { {{ in_strides | join(', ') }} }; +constexpr size_t {{ name|upper }}_OUT_STRIDES[] = { {{ out_strides | join(', ') }} }; -static 
constexpr unsigned int {{ name|upper }}_PERMUTE[] = { {{ output_dims_order | join(', ') }} }; -static constexpr unsigned int {{ name|upper }}_DIMS[] = { {{ in_dims[0] | join(', ') }}}; +constexpr size_t {{ name|upper }}_PERMUTE[] = { {{ output_dims_order | join(', ') }} }; +constexpr size_t {{ name|upper }}_DIMS[] = { {{ in_dims[0] | join(', ') }} }; +{% include "./_save_outputs.jinja" %} #endif /* {{ name|upper }}_LAYER_H */ \ No newline at end of file diff --git a/aidge_export_cpp/templates/data/aidge_tensor.jinja b/aidge_export_cpp/templates/data/aidge_tensor.jinja index 3f086afd0a8f9a479a6073b463cefce21fc3e752..a1d5a20dff9a48e364eca0a588ac193eb2f5ad83 100644 --- a/aidge_export_cpp/templates/data/aidge_tensor.jinja +++ b/aidge_export_cpp/templates/data/aidge_tensor.jinja @@ -1,7 +1,79 @@ #include <stdint.h> -static const {{ data_t }} {{ name }} +#define {{name|upper}}_FMT Format::{{ dformat | upper }} + +{%- set format_map = { + "int8_t": "%4d", + "int16_t": "%6d", + "int32_t": "%6d", + "int64_t": "%8d", + "uint8_t": "%4d", + "uint16_t": "%6d", + "uint32_t": "%6d", + "uint64_t": "%8d", + "float": "%.9f", + "half_float::half": "%.9f", + "double": "%.17f" +} %} + +static const {{ dtype }} {{ name }} {%- for dim in dims -%} [{{ dim }}] {%- endfor %} = -{{ values }}; + +{{ '{' -}} + +{# 1D #} +{%- if dims | length == 1 -%} +{{ '{' }} +{%- for x in range(dims[0]) -%} +{{ format_map[dtype] | format(values[x]) }}, +{%- endfor -%} +{{ '}' }}; +{%- endif -%} + +{#- 2D #} +{%- if dims | length == 2 -%} +{%- for y in range(dims[0]) -%} +{{ '{' }} + {%- for x in range(dims[1]) -%} + {{ format_map[dtype] | format(values[y][x]) }}, + {%- endfor -%} +{{ '}' }}, +{%- endfor -%} +{%- endif -%} + +{#- 3D #} +{%- if dims | length == 3 -%} +{%- for z in range(dims[0]) %} +{{ '{' }} + {%- for y in range(dims[1]) %} + {{ '{' }} + {%- for x in range(dims[2]) -%} + {{ format_map[dtype] | format(values[z][y][x]) }}, + {%- endfor -%} + {{ '}' }}, + {%- endfor %} +{{ '}' }}, +{%- endfor -%} +{%- endif -%} + +{#- 4D #} +{%- if dims | length == 4 -%} +{%- for n in range(dims[0]) %} +{{ '{' }} + {%- for z in range(dims[1]) %} + {{ '{' }} + {%- for y in range(dims[2]) %} + {{ '{' }} + {%- for x in range(dims[3]) -%} + {{ format_map[dtype] | format(values[n][z][y][x]) }}, + {%- endfor -%} + {{ '}' }}, + {%- endfor %} + {{ '}' }}, + {%- endfor %} +{{ '}' }}, +{%- endfor %} +{%- endif -%} +{{ '};' }} diff --git a/aidge_export_cpp/templates/data/parameters.jinja b/aidge_export_cpp/templates/data/parameters.jinja index b58ca9c1977a5e8f9af69401c0eb3e5f47fc6cc4..70e2b7dcc5b5728bcc252c4974d4e09b79f975f0 100644 --- a/aidge_export_cpp/templates/data/parameters.jinja +++ b/aidge_export_cpp/templates/data/parameters.jinja @@ -1,21 +1,66 @@ {#- For libraries -#} #include <stdint.h> +{%- set format_map = { + "int8_t": "%4d", + "int16_t": "%6d", + "int32_t": "%6d", + "int64_t": "%8d", + "uint8_t": "%4d", + "uint16_t": "%6d", + "uint32_t": "%6d", + "uint64_t": "%8d", + "float": "%.9f", + "half_float::half": "%.9f", + "double": "%.17f" +} %} + {# Design header of the array -#} -static const {{ data_t }} {{ name }}[{{ values|length }}] __attribute__((section(".nn_data"))) = +static const {{ dtype }} {{ name }}[{{ dims | join("*") }}] __attribute__((section(".nn_data"))) = { - {# For loop to add new elements -#} - {%- for i in range(values|length) %} +{# 1D #} +{%- if dims | length == 1 -%} +{%- for x in range(dims[0]) -%} +{{ format_map[dtype] | format(values[x]) }}, +{%- endfor -%} +{%- endif -%} + +{#- 2D #} +{%- if dims | length == 
2 -%} +{%- for y in range(dims[0]) %} +{{ ' ' }} + {%- for x in range(dims[1]) -%} + {{ format_map[dtype] | format(values[y][x]) }}, + {%- endfor %} +{%- endfor -%} +{%- endif -%} + +{#- 3D #} +{%- if dims | length == 3 -%} +{%- for z in range(dims[0]) %} +{{ ' ' }} + {%- for y in range(dims[1]) %} + {{ ' ' }} + {%- for x in range(dims[2]) -%} + {{ format_map[dtype] | format(values[z][y][x]) }}, + {%- endfor -%} + {%- endfor %} +{%- endfor -%} +{%- endif -%} - {#- Last value -#} - {%- if (i+1) == values|length -%} - {{ values[i]|string }} - {%- else -%} - {%- if (i+1) % 5 == 0 -%} - {{ values[i]|string + ",\n\t" }} - {%- else -%} - {{ values[i]|string + ", " }} - {%- endif -%} - {%- endif -%} +{#- 4D #} +{%- if dims | length == 4 -%} +{%- for n in range(dims[0]) %} +{{ ' ' }} + {%- for z in range(dims[1]) %} + {{ ' ' }} + {%- for y in range(dims[2]) %} + {{ ' ' }} + {%- for x in range(dims[3]) -%} + {{ format_map[dtype] | format(values[n][z][y][x]) }}, + {%- endfor -%} + {%- endfor %} {%- endfor %} +{%- endfor %} +{%- endif %} }; diff --git a/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja b/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja index 98778c1deff2a8bc2b32eac395f8113279f03d68..b30a03545271812ca5a17934caea2c3fc17c26e6 100644 --- a/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja +++ b/aidge_export_cpp/templates/kernel_forward/_aidge_cmp.jinja @@ -1,8 +1,14 @@ {%- if aidge_cmp is defined and aidge_cmp %} -#if AIDGE_CMP +#ifdef AIDGE_CMP aidge_cmp<{{ out_name[0] | upper }}_NB_OUTPUTS, {{ out_name[0] | upper }}_OUT_HEIGHT, - {{ out_name[0] | upper }}_OUT_WIDTH> + {{ out_name[0] | upper }}_OUT_WIDTH, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0] | upper }}_AIDGE_FMT, + {{ out_name[0] | upper }}_DEV_FMT> ("{{ name }}", ({{out_cdtype[0]}}*) {{ out_name[0] }}_aidge, {{ out_name[0] }}); #endif {%- endif %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja b/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja index f3bea038c52dc1500f6a827a24557f2bc4f7dcc9..e3853184e7bedf0195abe6867eb12439590001d5 100644 --- a/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja +++ b/aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja @@ -1,3 +1,3 @@ {%- for outidx in range(nb_out) %} -{{out_cdtype[outidx]}}* {{out_name[outidx]}} = ({{out_cdtype[outidx]}}*) mem + {{out_name[outidx]|upper}}_OFFSET; +{{out_cdtype[outidx]}}* {{out_name[outidx]}} = ({{out_cdtype[outidx]}}*) (mem + {{out_name[outidx]|upper}}_MEM_OFFSET); {%- endfor %} diff --git a/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja b/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja index 6865be575a613af16fc6a88fd969525abba80d0d..c0956d290f6c0f4b4cebd901cd1d531a69584c89 100644 --- a/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja +++ b/aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja @@ -1,19 +1,16 @@ -#if SAVE_OUTPUTS -{% for outidx in range(nb_out) -%} - FILE* {{out_name[outidx]|upper}}_STREAM = fopen("data/export_outputs/{{out_name[outidx]}}.txt", "w"); - saveOutputs<{{out_cdtype[outidx]}}>( - {{out_name[outidx]|upper}}_NB_OUTPUTS, - {{out_name[outidx]|upper}}_OUT_HEIGHT, - {{out_name[outidx]|upper}}_OUT_WIDTH, - {#- {{out_name[outidx]|upper}}_CONT_OFFSET, - {{out_name[outidx]|upper}}_CONT_SIZE, - {{out_name[outidx]|upper}}_WRAP_OFFSET, - 
{{out_name[outidx]|upper}}_WRAP_SIZE, - {{out_name[outidx]|upper}}_STRIDE, #} - {{out_name[outidx]}}, - {{out_name[outidx]|upper}}_STREAM, - Format::NHWC); - fclose({{out_name[outidx]|upper}}_STREAM); -{% endfor %} +#ifdef SAVE_OUTPUTS +printf("[NOTICE] - Saving outputs of node {{ name }}\n"); +FILE* {{ out_name[0] | upper }}_STREAM = fopen("data/export_outputs/{{out_name[0]}}.txt", "w"); +saveOutputs<{{ out_name[0] | upper }}_NB_OUTPUTS, + {{ out_name[0] | upper }}_OUT_HEIGHT, + {{ out_name[0] | upper }}_OUT_WIDTH, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0] | upper }}_DEV_FMT> + ({{ out_name[0] }}, + {{ out_name[0] | upper }}_STREAM); +fclose({{ out_name[0] | upper }}_STREAM); #endif diff --git a/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja b/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja index 1dc4eb530aeafdac10b59b3b2c8a0313d4411659..b1fd417bc50ae495a537f505a7ea7729778d04a4 100644 --- a/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/activation_forward.jinja @@ -1,7 +1,17 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -activation_forward<{{name|upper}}_NB_DATA, - {{name|upper}}_ACTIVATION> +activation_forward<{{name|upper}}_NB_ELTS, + {{name|upper}}_ACTIVATION, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> ({{in_name[0]}}, {{out_name[0]}}, {{name|upper}}_RESCALING); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} diff --git a/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja b/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja index 88cbc9a25f6e5342c2d3cc14f8e40fe452716944..aff15f26beae9dc68fae177a47e9546d49804073 100644 --- a/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/concat_forward.jinja @@ -1,22 +1,12 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -const float* {{ name|upper }}_INPUTS[] = { - {%- for i in range(nb_in) -%} - {{ in_name[i] }}{{ ", " if not loop.last else "" }} - {%- endfor -%} -}; - -unsigned int {{ name|upper }}_SIZES[] = { - {%- for i in range(nb_in) -%} - {{ name|upper }}_INPUT_{{i}}_SIZE{{ ", " if not loop.last else "" }} - {%- endfor -%} -}; - +const {{ out_cdtype[0] }}* {{ name }}_inputs[] = { {{ in_name|join(", ") }} }; concat_forward<{{ name|upper }}_AXIS_SIZE_POST, {{ name|upper }}_AXIS_SIZE_PRE, - {{ nb_in }}, float> - ({{ name|upper }}_INPUTS, - {{ name|upper }}_SIZES, + {{ name|upper }}_AXIS_SIZE, + {{ name|upper }}_TOTAL_AXIS_SIZE, + {{ nb_in }}> + ({{ name }}_inputs, {{ out_name[0] }}); {%- endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja index bdde325707eeb497a93ba2084c0672bd7f7e5daa..e4a5642f9f10057eac426a01ae2dadab186bf248 100644 --- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja +++ 
b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja @@ -14,7 +14,17 @@ convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name {{name|upper}}_DILATION_X, {{name|upper}}_KERNEL_HEIGHT, {{name|upper}}_KERNEL_WIDTH, - {{name|upper}}_ACTIVATION> + {{name|upper}}_ACTIVATION, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} diff --git a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja index 1a99921c185d14f4494c923092e2c36ab684945e..300be27a7e6ddbfacd18bb259303aac201787252 100644 --- a/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja @@ -1,12 +1,29 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -elemwise_forward<{{name|upper}}_NB_ELTS, - {{name|upper}}_ELEM_OP, - {{name|upper}}_ACTIVATION> - ({{out_name[0]}}, - {{name|upper}}_RESCALING, - {{in_name[0]}}, - {{in_name[1]}}); +elemwise_forward<{{name|upper}}_NB_MAT, + {{name|upper}}_ELEM_OP, + {{name|upper}}_INPUT1_CONT_SIZE, + {{name|upper}}_INPUT2_CONT_SIZE, + {{name|upper}}_OUTPUT_CONT_SIZE, + {{name|upper}}_OFFSET_IN1, + {{name|upper}}_OFFSET_IN2, + {{name|upper}}_ACTIVATION, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ in_name[1]|upper }}_MEM_CONT_OFFSET, + {{ in_name[1]|upper }}_MEM_CONT_SIZE, + {{ in_name[1]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[1]|upper }}_MEM_WRAP_SIZE, + {{ in_name[1]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> + ({{out_name[0]}}, {{name|upper}}_RESCALING, {{in_name[0]}}, {{in_name[1]}}); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} {% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..9f3fbf3009c96454aceb306042358355e6b60a22 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/erf_forward.jinja @@ -0,0 +1,6 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +erf_forward<{{name|upper}}_NB_ELTS> + ({{in_name[0]}}, {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja b/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja index 9a35d799be09d1bc5b311f750e64b38656f723c1..ad4e7479fd2b27fc7c3ba2c5e7b0f37b4bc8262a 100644 --- a/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja +++ 
b/aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja @@ -1,12 +1,22 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -fullyconnected_forward<{{ in_name[0]|upper }}_NB_CHANNELS, +fullyconnected{{ "_" ~ kernel if kernel != "" else "" }}_forward<{{ in_name[0]|upper }}_NB_CHANNELS, {{ in_name[0]|upper }}_IN_HEIGHT, {{ in_name[0]|upper }}_IN_WIDTH, {{ out_name[0]|upper }}_NB_OUTPUTS, {{ out_name[0]|upper }}_OUT_HEIGHT, {{ out_name[0]|upper }}_OUT_WIDTH, - {{name|upper}}_ACTIVATION> + {{name|upper}}_ACTIVATION, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> ({{in_name[0]}}, {{out_name[0]}}, {{in_name[1]}}, {{in_name[2]}}, {{name|upper}}_RESCALING); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} diff --git a/aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja b/aidge_export_cpp/templates/kernel_forward/identity_forward.jinja similarity index 60% rename from aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja rename to aidge_export_cpp/templates/kernel_forward/identity_forward.jinja index 6af8ece3428b563689f8373c0dc7560d867fff1d..bf83d0809c8c5ca1e8796c90d533b7ad856000ed 100644 --- a/aidge_export_cpp/templates/kernel_forward/reshape_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/identity_forward.jinja @@ -1,7 +1,7 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -reshape_forward<{{name|upper}}_NB_ELTS> - ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}); +identity_forward<{{name|upper}}_NB_ELTS> + ({{in_name[0]}}, {{out_name[0]}}); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} {% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja b/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja index 89cf2591139ef2719d516c2fba522534dbb806c8..15fdd913325854127fc1051c17a4d9924c15f699 100644 --- a/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja @@ -1,7 +1,17 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -leakyrelu_forward<{{name|upper}}_NB_DATA> - ({{input_name}}, {{output_name}}, {{name|upper}}_ALPHA); +leakyrelu_forward<{{name|upper}}_NB_ELTS, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> + ({{in_name[0]}}, {{out_name[0]}}, {{name|upper}}_ALPHA); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} {% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja index 090fbac398b207d29a1f4ebf94eca564e032a53b..0333d2e81322cf1ba34c82306e8a4b7a247c83e0 100644 --- a/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja +++ 
b/aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja @@ -1,8 +1,11 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} -matmul_forward<{{name|upper}}_M, - {{name|upper}}_K, +matmul_forward<{{name|upper}}_NB_MAT, {{name|upper}}_N, + {{name|upper}}_M, + {{name|upper}}_K, + {{name|upper}}_OFFSET_IN1, + {{name|upper}}_OFFSET_IN2, {{name|upper}}_ACTIVATION> ({{in_name[0]}}, {{in_name[1]}}, {{out_name[0]}}, {{name|upper}}_RESCALING); {% include "./_save_outputs.jinja" %} diff --git a/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja b/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja index fb1f2b7e0a1b33602c93b96856533a93eeec9023..42bed98cb99aaec9f72af2ebdc15e0c46b5331f3 100644 --- a/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja @@ -13,7 +13,17 @@ pooling_forward<{{ in_name[0]|upper }}_NB_CHANNELS, {{name|upper}}_KERNEL_HEIGHT, {{name|upper}}_KERNEL_WIDTH, {{name|upper}}_POOLING_TYPE, - {{name|upper}}_ACTIVATION> + {{name|upper}}_ACTIVATION, + {{ in_name[0]|upper }}_MEM_CONT_OFFSET, + {{ in_name[0]|upper }}_MEM_CONT_SIZE, + {{ in_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ in_name[0]|upper }}_MEM_WRAP_SIZE, + {{ in_name[0]|upper }}_MEM_STRIDE, + {{ out_name[0]|upper }}_MEM_CONT_OFFSET, + {{ out_name[0]|upper }}_MEM_CONT_SIZE, + {{ out_name[0]|upper }}_MEM_WRAP_OFFSET, + {{ out_name[0]|upper }}_MEM_WRAP_SIZE, + {{ out_name[0]|upper }}_MEM_STRIDE> ({{in_name[0]}}, {{out_name[0]}}); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} diff --git a/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja b/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..ed75773297b5a5f6c75d99c86ad21b4063d374de --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/reducemean_forward.jinja @@ -0,0 +1,17 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +reducemean_forward<{{ in_name[0]|upper }}_NB_DIMS, + {{ in_name[0]|upper }}_NB_ELTS, + {{ out_name[0]|upper }}_NB_ELTS, + {{ name|upper }}_NB_AXES_TO_REDUCES> + ({{ name|upper }}_AXES_TO_REDUCE, + {{ in_name[0]|upper }}_DIMS, + {{ name|upper }}_PREAXIS_STRIDES, + {{ name|upper }}_POSTAXIS_STRIDES, + {{ in_name[0] }}, + {{ out_name[0] }}); +{% include "./_save_outputs.jinja" %} +{% include "./_aidge_cmp.jinja" %} +{% endfilter %} + + diff --git a/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja b/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja deleted file mode 100644 index ce4ffb869c7f99ac789311a9bc98b926253f968f..0000000000000000000000000000000000000000 --- a/aidge_export_cpp/templates/kernel_forward/rescaling_forward.jinja +++ /dev/null @@ -1,9 +0,0 @@ -{% filter indent(width=4, first=False) %} -{% include "./_mem_offset.jinja" %} -rescaling_forward<{{name|upper}}_NB_DATA, - {{name|upper}}_ACTIVATION> - ({{in_name[0]}}, - {{out_name[0]}}, - {{name|upper}}_RESCALING); -{% include "./_save_outputs.jinja" %} -{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja b/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..6ce49f952a6a8411150720987d6f3d00091c61c5 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/sigmoid_forward.jinja @@ -0,0 +1,6 @@ +{% filter indent(width=4, first=False) %} +{% include 
"./_mem_offset.jinja" %} +sigmoid_forward<{{name|upper}}_NB_ELTS> + ({{in_name[0]}}, {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% endfilter %} \ No newline at end of file diff --git a/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja b/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja new file mode 100644 index 0000000000000000000000000000000000000000..41ed8a2a0197d3ff1e60967663678054800fd519 --- /dev/null +++ b/aidge_export_cpp/templates/kernel_forward/slice_forward.jinja @@ -0,0 +1,16 @@ +{% filter indent(width=4, first=False) %} +{% include "./_mem_offset.jinja" %} +slice_forward<{{in_cdtype[0]}}, + {{name|upper}}_NB_DIMS, + {{name|upper}}_NB_ELTS, + {{name|upper}}_NB_AXES, + {{name|upper}}_STARTS, + {{name|upper}}_ENDS, + {{name|upper}}_STEPS, + {{name|upper}}_AXES_MOD, + {{name|upper}}_AXES_DIV> + ({{in_name[0]}}, + {{out_name[0]}}); +{% include "./_save_outputs.jinja" %} +{% include "./_aidge_cmp.jinja" %} +{% endfilter %} diff --git a/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja b/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja index 58a1e01cd9dd1db49d210ac16579675f90c5e2d4..cd7a306b62957b4801e2b7c8ac9a2abd95a9cb66 100644 --- a/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja +++ b/aidge_export_cpp/templates/kernel_forward/transpose_ND_forward.jinja @@ -1,11 +1,12 @@ {% filter indent(width=4, first=False) %} {% include "./_mem_offset.jinja" %} transpose_ND_forward<{{in_cdtype[0]}}, - {{name|upper}}_NB_DIMS> - ({{in_name[0]}}, - {{name|upper}}_DIMS, - {{name|upper}}_PERMUTE, + {{name|upper}}_NB_DIMS, {{name|upper}}_NB_ELTS, + {{name|upper}}_PERMUTE, + {{name|upper}}_DIMS, + {{name|upper}}_OUT_STRIDES> + ({{in_name[0]}}, {{out_name[0]}}); {% include "./_save_outputs.jinja" %} {% include "./_aidge_cmp.jinja" %} diff --git a/aidge_export_cpp/unit_tests/test_examples.py b/aidge_export_cpp/unit_tests/test_examples.py index d3bb40636eba420b457263873ca00fd6b2d3b6a9..98b5e749a44d2e9e5642b625045ed4790f165890 100644 --- a/aidge_export_cpp/unit_tests/test_examples.py +++ b/aidge_export_cpp/unit_tests/test_examples.py @@ -9,20 +9,21 @@ EXAMPLES_DIR = CURRENT_DIR / "../../examples" # Dictionary of test cases: {id: (script_name, script_args)} TEST_CASES = { - "lenet-no-args": ("export_LeNet/lenet.py", []), - "lenet-int8": ("export_LeNet/lenet.py", ["--dtype=int8"]), - "resnet18-no-args": ("export_ResNet18/resnet18.py", ["--mock_db"]), - "resnet18-int8": ("export_ResNet18/resnet18.py", ["--mock_db", "--dtype=int8"]) + "lenet-no-args": ("export_LeNet/lenet.py", [], ["MODEL ACCURACY = 100.0 %", "Prediction out#0: 7 (1)"]), + "lenet-int8": ("export_LeNet/lenet.py", ["--dtype=int8"], ["MODEL ACCURACY = 100.0 %", "MODEL ACCURACY = 100.0 %", "QUANTIZED ACCURACY = 100.0 %", "Prediction out#0: 7 (119)"]), + "lenet-int8-wrap": ("export_LeNet/lenet.py", ["--dtype=int8", "--mem_wrap"], ["MODEL ACCURACY = 100.0 %", "MODEL ACCURACY = 100.0 %", "QUANTIZED ACCURACY = 100.0 %", "Prediction out#0: 7 (119)"]), + "resnet18-no-args": ("export_ResNet18/resnet18.py", ["--mock_db"], []), + "resnet18-int8": ("export_ResNet18/resnet18.py", ["--mock_db", "--dtype=int8"], []) } def generate_test_cases(): """Parse TEST_CASES to provide valid pytest params. 
""" - for test_id, (script, args) in TEST_CASES.items(): - yield pytest.param(script, args, id=test_id) + for test_id, (script, args, result) in TEST_CASES.items(): + yield pytest.param(script, args, result, id=test_id) -@pytest.mark.parametrize(("script_name", "script_args"), generate_test_cases()) -def test_example_scripts_run_without_error(script_name, script_args): +@pytest.mark.parametrize(("script_name", "script_args", "script_result"), generate_test_cases()) +def test_example_scripts_run_without_error(script_name, script_args, script_result): """Basic test to verify that examples script run withoput raising an Error. This test DO NOT check that the examples are working only that they are not broken. """ @@ -34,6 +35,9 @@ def test_example_scripts_run_without_error(script_name, script_args): ) assert result.returncode == 0, f"{script_name} failed with error:\n{result.stderr}\n\nTraceback:\n{result.stdout}" + for res in script_result: + assert res in result.stdout, f"Expected output '{res}' not found in the script output:\n{result.stdout}" + def main(): import sys diff --git a/aidge_export_cpp/unit_tests/test_export.py b/aidge_export_cpp/unit_tests/test_export.py index 0a34153cebfa405389dd7ccff20a5d359d7967a4..55f558220753723749a601e40cd625173655a089 100644 --- a/aidge_export_cpp/unit_tests/test_export.py +++ b/aidge_export_cpp/unit_tests/test_export.py @@ -12,6 +12,10 @@ import shutil from aidge_core.utils import run_command from aidge_export_cpp import cpp_fuse_to_metaops, set_nodes_names +import pytest +from _pytest.unittest import TestCaseFunction + +aidge_core.Log.set_console_level(aidge_core.Level.Error) # Reduce useless logs def initFiller(model): # Initialize parameters (weights and biases) @@ -22,10 +26,10 @@ def initFiller(model): value.set_backend("cpu") tuple_out = node.output(0)[0] # No conv in current network - if tuple_out[0].type() == "Conv" and tuple_out[1] == 1: + if tuple_out[0].type() == "Conv2D" and tuple_out[1] == 1: # Conv weight aidge_core.xavier_uniform_filler(value) - elif tuple_out[0].type() == "Conv" and tuple_out[1] == 2: + elif tuple_out[0].type() == "Conv2D" and tuple_out[1] == 2: # Conv bias aidge_core.constant_filler(value, 0.01) elif tuple_out[0].type() == "FC" and tuple_out[1] == 1: @@ -64,15 +68,40 @@ def _np_init_ones(shape, default_value=0.01, dtype=np.float32): return data.reshape(shape).astype(dtype) +# Global dictionary to store test reports +test_reports = {} + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_makereport(item, call): + # Execute all other hooks to obtain the report object + outcome = yield + rep = outcome.get_result() + # Store the report in the global dictionary + test_reports[item.nodeid] = rep + class test_operator_export(unittest.TestCase): def setUp(self): - # TODO change seed at each test ? 
RNG_SEED = 1234 np.random.seed(RNG_SEED) aidge_core.random.Generator.set_seed(RNG_SEED) + def tearDown(self): - pass + result = self._outcome.result + test_succeeded = True + if isinstance(result, TestCaseFunction): + print("Test ran with pytest cannot retrieve if the test was a success.") + else: + test_succeeded = not any(test is self for test, _ in result.failures + result.errors) + if test_succeeded: + shutil.rmtree(self.export_folder) + + # rep = test_reports.get(self.id()) + # if rep is not None and rep.passed: + # # Perform teardown actions only if the test passed + # print(g"Removing {self.export_folder}") + # shutil.rmtree(self.export_folder, ignore_errors=True) + def unit_test_export(self, graph_view, op_name, in_dims, random_inputs=True, random_weights=True, default_value=0.01): """ @@ -86,7 +115,8 @@ class test_operator_export(unittest.TestCase): 3- Generate Cpp export with a main that compare the result of the inference with the result obtained at step 2. 4- Retrieve standard output and using regex to now if the results are the same """ - graph_view.compile("cpu", aidge_core.dtype.float32, dims=in_dims) + graph_view.set_backend("cpu") + graph_view.forward_dims(dims=in_dims) for node in graph_view.get_nodes(): if node.type() == "Producer": @@ -95,9 +125,8 @@ class test_operator_export(unittest.TestCase): if (random_weights): tensor = aidge_core.Tensor(_np_init(value.dims())) - node.get_operator().set_output(0, tensor) - else: + elif default_value != None: aidge_core.constant_filler(value, default_value) # Fuse operators to match implemented cpp kernels @@ -117,9 +146,19 @@ class test_operator_export(unittest.TestCase): # Note the convention ``<op_name>_test`` is useful for gitignore to avoid pushing generated export by accident. export_folder = op_name + "_test" - + self.export_folder = export_folder shutil.rmtree(export_folder, ignore_errors=True) + aidge_core.export_utils.generate_main_compare_cpp(export_folder, graph_view) + + graph_view.set_backend(aidge_export_cpp.ExportLibCpp._name) + aidge_core.adapt_to_backend(graph_view) + graph_view.forward_dims(dims=in_dims) + graph_view.save(export_folder + "/graph") + + scheduler = aidge_core.SequentialScheduler(graph_view) + scheduler.generate_scheduling() + # Export the model in C++ standalone aidge_core.export_utils.scheduler_export( scheduler, @@ -129,7 +168,6 @@ class test_operator_export(unittest.TestCase): memory_manager_args={"stats_folder": f"{export_folder}/stats", "wrapping": False } ) - aidge_core.export_utils.generate_main_compare_cpp(export_folder, graph_view) print("COMPILATION") try: @@ -241,7 +279,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "HardmaxAxis0", [[10]]) - @unittest.expectedFailure def test_FC_image_in(self): """Test exporting a FC operator with a HWC input. 
""" @@ -259,6 +296,14 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "ReLU", [[1, 10]]) + def test_leakyrelu(self): + print("LeakyReLU") + model = aidge_core.sequential([ + aidge_core.LeakyReLU(name="leakyrelu0", negative_slope=0.1) + ]) + + self.unit_test_export(model, "LeakyReLU", [[1, 10]]) + def test_add(self): print("Add") model = aidge_core.sequential([ @@ -286,8 +331,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "Add", [[1, 5, 7]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_add_simple_broadcast(self): print("AddSimpleBroadcast") model = aidge_core.sequential([ @@ -297,8 +340,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "AddSimpleBroadcast", [[1, 7, 5]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_add_double_broadcast(self): print("AddDoubleBroadcast") model = aidge_core.sequential([ @@ -335,8 +376,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "Sub", [[1, 5, 7]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_sub_simple_broadcast(self): print("SubSimpleBroadcast") model = aidge_core.sequential([ @@ -346,8 +385,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "SubSimpleBroadcast", [[1, 7, 5]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_sub_double_broadcast(self): print("SubDoubleBroadcast") model = aidge_core.sequential([ @@ -384,8 +421,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "Mul", [[1, 5, 7]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_mul_simple_broadcast(self): print("MulSimpleBroadcast") model = aidge_core.sequential([ @@ -395,8 +430,6 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "MulSimpleBroadcast", [[1, 7, 5]]) - # "Broadcast not supported yet in export operator" - @unittest.expectedFailure def test_mul_double_broadcast(self): print("MulDoubleBroadcast") model = aidge_core.sequential([ @@ -415,6 +448,54 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "MulBatch", [[3, 5, 7]]) + def test_div(self): + print("Div") + model = aidge_core.sequential([ + aidge_core.Producer([1, 5, 5], name="producer"), + aidge_core.Div(name="div") + ]) + + self.unit_test_export(model, "Div", [[1, 5, 5]]) + + + def test_div_higher(self): + print("Div") + model = aidge_core.sequential([ + aidge_core.Producer([1, 5, 7], name="producer"), + aidge_core.Div(name="div") + ]) + + self.unit_test_export(model, "Div", [[1, 5, 7]]) + + + def test_div_simple_broadcast(self): + print("DivSimpleBroadcast") + model = aidge_core.sequential([ + aidge_core.Producer([1, 1, 5], name="producer"), + aidge_core.Div(name="div") + ]) + + self.unit_test_export(model, "DivSimpleBroadcast", [[1, 7, 5]]) + + + def test_div_double_broadcast(self): + print("DivDoubleBroadcast") + model = aidge_core.sequential([ + aidge_core.Producer([1, 1, 7], name="producer"), + aidge_core.Div(name="div") + ]) + + self.unit_test_export(model, "DivDoubleBroadcast", [[1, 5, 1]]) + + def test_div_batch(self): + print("DivBatch") + model = aidge_core.sequential([ + aidge_core.Producer([3, 5, 7], name="producer"), + aidge_core.Div(name="div") + ]) + + self.unit_test_export(model, "DivBatch", [[3, 5, 7]]) + def test_concat(self): print("Concat") model = 
aidge_core.sequential([ @@ -432,6 +513,22 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "Transpose", [[1, 7, 8, 2]]) + def test_identity(self): + print("Identity") + model = aidge_core.sequential([ + aidge_core.Identity(name="identity") + ]) + + self.unit_test_export(model, "Identity", [[1, 5, 6, 8]]) + + def test_slice(self): + print("Slice") + model = aidge_core.sequential([ + aidge_core.Slice([1,3,5,2], [5,6,-1,3], [-4,2,1,3], [1,2,1,1], name="slice") + ]) + + self.unit_test_export(model, "Slice", [[5, 7, 12, 3]]) + def test_reshape(self): print("Reshape") model = aidge_core.sequential([ @@ -441,7 +538,7 @@ class test_operator_export(unittest.TestCase): shape.add_child(model.get_node("reshape"), 0, 1) model.add(shape) - self.unit_test_export(model, "Reshape", [[1, 7, 8, 2]], random_weights=False) + self.unit_test_export(model, "Reshape", [[1, 7, 8, 2]], random_weights=False, default_value=None) def test_matmul(self): print("MatMul") @@ -452,6 +549,51 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "MatMul", [[8, 4]]) + def test_matmul_larger(self): + print("MatmulLarger") + model = aidge_core.sequential([ + aidge_core.Producer([1, 5, 7], name="producer"), + aidge_core.MatMul(name="MatMul") + ]) + + self.unit_test_export(model, "MatMul", [[1, 7, 5]]) + + def test_matmul_higher(self): + print("MatMulHigher") + model = aidge_core.sequential([ + aidge_core.Producer([1, 5, 7], name="producer"), + aidge_core.MatMul(name="matmul") + ]) + + self.unit_test_export(model, "MatMul", [[1, 7, 1]]) + + def test_matmul_simple_broadcast(self): + print("MatMulSimpleBroadcast") + model = aidge_core.sequential([ + aidge_core.Producer([1, 1, 5], name="producer"), + aidge_core.MatMul(name="MatMul") + ]) + + self.unit_test_export(model, "MatMulSimpleBroadcast", [[1, 5, 7]]) + + def test_matmul_double_broadcast(self): + print("MatMulDoubleBroadcast") + model = aidge_core.sequential([ + aidge_core.Producer([3, 1, 5], name="producer"), + aidge_core.MatMul(name="MatMul") + ]) + + self.unit_test_export(model, "MatMulDoubleBroadcast", [[3, 5, 5]]) + + def test_matmul_batch(self): + print("MatMulBatch") + model = aidge_core.sequential([ + aidge_core.Producer([3, 5, 7], name="producer"), + aidge_core.MatMul(name="matmul") + ]) + + self.unit_test_export(model, "MatMulBatch", [[3, 7, 7]]) + def test_concat_axis_2(self): print("ConcatAxis2") model = aidge_core.sequential([ @@ -476,7 +618,47 @@ class test_operator_export(unittest.TestCase): aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(3, 3), name="conv") ]) - self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False) + self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]]) + + def test_conv2D_asym(self): + print("Conv2D_asym") + model = aidge_core.sequential([ + aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(2, 5), name="conv") + ]) + + self.unit_test_export(model, "Conv2D_asym", [[1, 3, 22, 22]]) + + def test_conv2D_asym2(self): + print("Conv2D_asym2") + model = aidge_core.sequential([ + aidge_core.Conv2D(in_channels=3, out_channels=3, kernel_dims=(2, 5), name="conv") + ]) + + self.unit_test_export(model, "Conv2D_asym2", [[1, 3, 59, 22]]) + + def test_conv1D(self): + print("Conv1D") + model = aidge_core.sequential([ + aidge_core.Conv1D(in_channels=3, out_channels=3, kernel_dims=[3],name="conv1d") + ]) + + self.unit_test_export(model, "Conv1D", [[1, 3, 12]]) + + def test_conv1D_stride(self): + print("Conv1D_stride") + model = aidge_core.sequential([ + 
aidge_core.Conv1D(in_channels=3, out_channels=3, kernel_dims=[3], stride_dims=[3],name="conv1d") + ]) + + self.unit_test_export(model, "Conv1D_stride", [[1, 3, 24]]) + + def test_conv1D_dilation(self): + print("Conv1D_dilation") + model = aidge_core.sequential([ + aidge_core.Conv1D(in_channels=1, out_channels=8, kernel_dims=[3], stride_dims=[3], dilation_dims=[5],name="conv1d") + ]) + + self.unit_test_export(model, "Conv1D_dilation", [[1, 1, 107]]) def test_convDepthWise2D(self): print("ConvDepthWise2D") @@ -484,15 +666,16 @@ class test_operator_export(unittest.TestCase): aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv") ]) - self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]], False, False) + self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]]) def test_max_pooling(self): print("MaxPooling2D") model = aidge_core.sequential([ aidge_core.MaxPooling2D(kernel_dims=(3, 3), name="max_pool") ]) + model.set_ordered_outputs([(model.get_node("max_pool"), 0)], True) - self.unit_test_export(model, "MaxPooling2D", [[1, 2, 12, 12]], False, False) + self.unit_test_export(model, "MaxPooling2D", [[1, 2, 12, 12]]) def test_avg_pooling(self): print("AvgPooling2D") @@ -500,7 +683,7 @@ class test_operator_export(unittest.TestCase): aidge_core.AvgPooling2D(kernel_dims=(3, 3), name="avg_pool") ]) - self.unit_test_export(model, "AvgPooling2D", [[1, 2, 12, 12]], False, False) + self.unit_test_export(model, "AvgPooling2D", [[1, 2, 12, 12]]) def test_pad2D(self): print("Pad2D") @@ -559,6 +742,46 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "Pad2DNotConstant", [[1, 5, 7, 11]]) + def test_reducemean_2_axes_minimal(self): + print("ReduceMean2Axes_minimal") + model = aidge_core.sequential([ + aidge_core.ReduceMean([1,2], keep_dims=False, name="ReduceMean") + ]) + + self.unit_test_export(model, "ReduceMean2Axes_minimal", [[1,2,3]]) + + def test_reducemean_1_axis(self): + print("ReduceMean1Axis") + model = aidge_core.sequential([ + aidge_core.ReduceMean([2], keep_dims=False, name="ReduceMean") + ]) + + self.unit_test_export(model, "ReduceMean1Axis", [[3,3,2]]) + + def test_reducemean_all_axes(self): + print("ReduceMean") + model = aidge_core.sequential([ + aidge_core.ReduceMean([], keep_dims=False, name="ReduceMean") + ]) + + self.unit_test_export(model, "ReduceMean0Axes", [[3,3,2]]) + + def test_reducemean_no_axes(self): + print("ReduceMean") + model = aidge_core.sequential([ + aidge_core.ReduceMean([], keep_dims=False, noop_with_empty_axes = True, name="ReduceMean") + ]) + + self.unit_test_export(model, "ReduceMean0Axes", [[3,3,2]]) + + def test_reducemean_several_axes(self): + print("ReduceMeanSeveralAxes") + model = aidge_core.sequential([ + aidge_core.ReduceMean([1, 3], keep_dims=False, name="ReduceMean") + ]) + + self.unit_test_export(model, "ReduceMeanSeveralAxes", [[1, 5, 7, 11]]) + def test_batchnorm2D(self): print("BatchNormalization2D") model = aidge_core.sequential([ @@ -624,16 +847,101 @@ class test_operator_export(unittest.TestCase): self.unit_test_export(model, "BatchNorm2DDenser", [[1, 3, 5, 7]], False, False) - - def test_cpp(self): - print("Export test to do") - def test_Conv(self): + print("Conv") model = aidge_core.sequential([ aidge_core.Conv2D(1, 1, [3, 3], name="InputNode") ]) initFiller(model) self.unit_test_export(model, "Conv", [[1, 1, 9, 9]]) + def test_PaddedConv(self): + print("PaddedConv") + model = aidge_core.sequential([ + aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 
2], padding_dims=[3, 3, 3, 3]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConv", [[1, 3, 19, 19]]) + + def test_PaddedConv_asym(self): + print("PaddedConv_asym") + model = aidge_core.sequential([ + aidge_core.PaddedConv2D(3, 7, kernel_dims=[3, 7], name="InputNode", stride_dims=[2, 1], padding_dims=[3, 1, 2, 3]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConv_asym", [[1, 3, 19, 19]]) + + def test_PaddedConv_dilated(self): + print("PaddedConv_dilated") + model = aidge_core.sequential([ + aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3], dilation_dims=[2, 2]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConv_dilated", [[1, 3, 19, 19]]) + + def test_PaddedConv_dilated_asym(self): + print("PaddedConv_dilated_asym") + model = aidge_core.sequential([ + aidge_core.PaddedConv2D(3, 7, kernel_dims=[7, 5], name="InputNode", stride_dims=[1, 2], padding_dims=[1, 3, 3, 0], dilation_dims=[1, 2]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConv_dilated_asym", [[1, 3, 19, 19]]) + + def test_PaddedConvDW(self): + print("PaddedConvDW") + model = aidge_core.sequential([ + aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConvDW", [[1, 3, 19, 19]]) + + def test_PaddedConvDW_asym(self): + print("PaddedConvDW_asym") + model = aidge_core.sequential([ + aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[3, 7], name="InputNode", stride_dims=[2, 1], padding_dims=[3, 1, 2, 3]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConvDW_asym", [[1, 3, 19, 19]]) + + def test_PaddedConvDW_dilated(self): + print("PaddedConvDW_dilated") + model = aidge_core.sequential([ + aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 7], name="InputNode", stride_dims=[2, 2], padding_dims=[3, 3, 3, 3], dilation_dims=[2, 2]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConvDW_dilated", [[1, 3, 19, 19]]) + + def test_PaddedConvDW_dilated_asym(self): + print("PaddedConvDW_dilated_asym") + model = aidge_core.sequential([ + aidge_core.PaddedConvDepthWise2D(3, kernel_dims=[7, 5], name="InputNode", stride_dims=[1, 2], padding_dims=[1, 3, 3, 0], dilation_dims=[1, 2]) + ]) + initFiller(model) + self.unit_test_export(model, "PaddedConvDW_dilated_asym", [[1, 3, 19, 19]]) + + def test_Conv2(self): + print("Conv2") + model = aidge_core.sequential([ + aidge_core.Conv2D(2, 2, [3, 3], name="InputNode") + ]) + initFiller(model) + self.unit_test_export(model, "Conv2", [[1, 2, 9, 9]], random_inputs=True, random_weights=False, default_value=None) + + def test_erf(self): + print("Erf") + model = aidge_core.sequential([ + aidge_core.Erf(name="erf") + ]) + + self.unit_test_export(model, "Erf", [[1, 10]]) + + def test_sigmoid(self): + print("Sigmoid") + model = aidge_core.sequential([ + aidge_core.Sigmoid(name="sigmoid") + ]) + + self.unit_test_export(model, "Sigmoid", [[1, 100]]) + if __name__ == '__main__': unittest.main() diff --git a/examples/export_LeNet/.gitignore b/examples/export_LeNet/.gitignore index 98ce649a943a90590bb8f4f067a3c1ac9691dcbc..faa9a65c0125f8177ff3f772833f8ffd5f88d886 100644 --- a/examples/export_LeNet/.gitignore +++ b/examples/export_LeNet/.gitignore @@ -1,6 +1,8 @@ # Exclude export artefacts -export_lenet_int8/ +export_lenet_* log_outputs/* +graph_log/ assets/* data/* log.txt +*.json diff --git a/examples/export_LeNet/lenet.py 
b/examples/export_LeNet/lenet.py index 1cda87b9583a37007b61ef481d49c2bed037020f..8dccad9a5ce9387871ae48585e76a6ac6986546b 100644 --- a/examples/export_LeNet/lenet.py +++ b/examples/export_LeNet/lenet.py @@ -4,32 +4,20 @@ lenet.py Run this file to export a LeNet using the Aidge CPP Export module. """ -import os -import shutil import random -import aidge_core.utils -import numpy as np import subprocess +import numpy as np # Aidge Modules import aidge_core import aidge_onnx -import aidge_backend_cpu -import aidge_quantization import aidge_export_cpp +import aidge_backend_cpu +import aidge_quantization -from aidge_export_cpp.export_utils import ( - cpp_fuse_to_metaops, - set_nodes_names, - set_nodes_datatypes, - exclude_unwanted_producers) - -from aidge_core.export_utils import remove_optional_inputs, get_node_from_metaop +from aidge_export_cpp.export_utils import * # Torch (Dataset) -import torch -import torch.nn.functional as F -from torch import nn from torchvision import transforms, datasets # Arguments @@ -38,7 +26,8 @@ import argparse supported_types = ["float32", "int8"] parser = argparse.ArgumentParser(description="Export the LeNet model with the aidge_export_cpp module.") -parser.add_argument("--dev", action="store_true", help="Export in dev mode") +parser.add_argument("--dev_mode", action="store_true", help="Export in dev mode") +parser.add_argument("--mem_wrap", action="store_true", help="Use memory wrapping") parser.add_argument("--no_cuda", action="store_true", help="Disable USE_CUDA usage to perform inferences and training.") parser.add_argument("--dtype", type=str, choices=supported_types, default="float32", help="Specify the targeted datatype : [int8, float32]") parser.add_argument("--aidge_cmp", action="store_true", help="Use aidge tensor results as reference.") @@ -59,8 +48,14 @@ parser.add_argument( ) args = parser.parse_args() +# CUDA Usage USE_CUDA = not args.no_cuda +if USE_CUDA: + import aidge_backend_cuda + +backend = "cuda" if USE_CUDA else "cpu" + # Setting Aidge verbose level if args.verbose == 0: aidge_core.Log.set_console_level(aidge_core.Level.Error) @@ -71,236 +66,97 @@ elif args.verbose == 2: elif args.verbose >= 3: aidge_core.Log.set_console_level(aidge_core.Level.Debug) -if USE_CUDA: - import aidge_backend_cuda - # ------------------------------------------------------------ -# EXPORT CONFIG +# DOWNLOAD & LOAD THE MODEL # ------------------------------------------------------------ -""" -Export configuration details : -- RNG_SEED : Fix a random seed for torch to always get the same images from the dataset, - therefore always getting the same output. -- NB_TEST : Number of example inferences to perform (used to get an accuracy approximation). -- NB_CALIB : Number of samples used for the calibration step of quantization. -- MODEL_NAME : Should be the same name as the onnx file you want to load and export. -- DO_EXAMPLES : Perform example inferences (and allow to get accuracy approximation) -- NB_BITS : Quantization output precision. Should be 8 to work with this export. -- TARGET_TYPE : The aidge datatype for tensors to be casted after the quantization step [float64, float32, int32]. -- OPTIM_SIGN : Quantization optional optimization based on data sign. -- SINGLE_SHIFT : Quantization option specifying if inserted scaling nodes should be - single shift or floating point. -- NO_QUANT : Skip the quantization step. -- CLIPPING : Clipping method during quantization. -- FOLD_GRAPH : The quantization step adds cast nodes to cast the graph into the given TARGET_TYPE. 
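With the renamed --dev_mode flag and the new --mem_wrap option, the updated script can be driven the same way the example tests do it, through a subprocess call. The sketch below assumes it is run from the repository root and only uses flags visible in the argparse definitions above.

# Illustrative invocation of the updated export script (path assumed
# relative to the repository root, as in the example tests).
import subprocess
import sys

args = ["--dtype=int8", "--mem_wrap", "--no_cuda"]
result = subprocess.run(
    [sys.executable, "examples/export_LeNet/lenet.py", *args],
    capture_output=True,
    text=True,
)
assert result.returncode == 0, result.stderr
print(result.stdout)   # expected to contain lines such as "MODEL ACCURACY = ..."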
- Enabling the FOLD_GRAPH will automatically fold these nodes into the following - ones at the end of quantization step. -- USE_CUDA : Determine if the quantization step uses the GPU. It is generally recommended - to enable this option if you have access to GPUs as the quantization step - may take a while to complete. -- DEV_MODE : The dev mode allows to identify errors more easily exporting the model with - symbolic links enabling to modify the source files directly in the - generated export (make sure you installed the export plugin running - `pip install -e .`). - Enabled running this python file, adding the --dev argument. -- AIDGE_CMP : Saves and export the outputs generated by the aidge inferences in order - to compare it with the export outputs. - Enabled running this python file, adding the --aidge_cmp argument. -""" +# Download the model +file_url = "https://huggingface.co/EclipseAidge/LeNet/resolve/main/lenet_mnist.onnx?download=true" +file_path = "lenet_mnist.onnx" +aidge_core.utils.download_file(file_path, file_url) -print(" Available backends : ", aidge_core.Tensor.get_available_backends()) - -quantize_model = False -NB_BITS = 32 -TARGET_TYPE = aidge_core.dtype.float32 - -if args.dtype == "float32": - quantize_model = False -elif args.dtype == "int8": - quantize_model = True - NB_BITS = 8 - TARGET_TYPE = aidge_core.dtype.int32 # int8 not yet available -else: - print(f"[ERROR] Datatype '{args.dtype}' not supported.") - print(f"[ERROR] Supported datatypes : {supported_types}.") - exit(1) - -RNG_SEED = 1234 -NB_TEST = 10 # Example inferences -NB_CALIB = 20 # Calibration set -MODEL_NAME = 'lenet' -EXPORT_FOLDER = f"export_{MODEL_NAME}_int8" -DO_EXAMPLES = True - -# Quantization params -OPTIM_SIGN = False -SINGLE_SHIFT = True -ROUNDING = True -NO_QUANT = False -CLIPPING = aidge_quantization.Clipping.MSE # 'MAX' -FOLD_GRAPH = True - -# Export modes -DEV_MODE = args.dev -AIDGE_CMP = args.aidge_cmp - -print('\n RNG_SEED = ', RNG_SEED) -print(' MODEL_NAME = ', MODEL_NAME) -print(' NB_TEST = ', NB_TEST) -print(' NB_CALIB = ', NB_CALIB) -print(' NB_BITS = ', NB_BITS) -print(' OPTIM_SIGN = ', OPTIM_SIGN) -print(' NO_QUANT = ', NO_QUANT) -print(' CLIPPING = ', CLIPPING) -print(' SINGLE_SHIFT = ', SINGLE_SHIFT) -print(' USE_CUDA = ', USE_CUDA) -print(' DEV_MODE = ', DEV_MODE) - -torch.manual_seed(RNG_SEED) -random.seed(RNG_SEED) +# Load the model +model = aidge_onnx.load_onnx(file_path, verbose=False) -backend = "cuda" if USE_CUDA else "cpu" +# -------------------------------------------------------------- +# CLEAN THE MODEL +# -------------------------------------------------------------- -# ------------------------------------------------------------ -# CREATE THE LENET MODEL -# ------------------------------------------------------------ -""" -The LeNet model is created and trained using the create_lenet file. -If a lenet.onnx file is already present in the current folder, this step will be skiped. -The generated network is not yet quantized. 
-""" -# Define the target path and filename -file_url = "https://huggingface.co/EclipseAidge/LeNet/resolve/main/lenet_mnist.onnx?download=true" -file_path = MODEL_NAME + "_mnist.onnx" -aidge_core.utils.download_file(file_path, file_url) +aidge_core.remove_flatten(model) +aidge_core.fuse_batchnorm(model) +aidge_core.expand_metaops(model, name_format="{0}_{1}_{2}") # -------------------------------------------------------------- -# CREATE THE SAMPLES +# TEST THE MODEL # -------------------------------------------------------------- +# Create the samples + +NB_TEST = 10 # XXX 100 - Example inferences +NB_CALIB = 20 # XXX 20 - Calibration set + transform = transforms.ToTensor() test_set = datasets.MNIST(root='./data', train=False, transform=transform, download=True) tensors = [] labels = [] -index = 0 -for in_tensor, label in test_set: - array = np.array(in_tensor) - array = np.reshape(array, (1, 1, 28, 28)) - tensor = aidge_core.Tensor(array) - tensor.set_backend(backend) - tensor.set_datatype(aidge_core.dtype.float32) +for i, (tensor, label) in enumerate(test_set): + tensor = np.reshape(tensor.numpy(), (1, 1, 28, 28)) + tensor = aidge_core.Tensor(tensor) + tensor.set_backend("cpu") tensors.append(tensor) labels.append(label) - index += 1 - if (index == max(NB_TEST, NB_CALIB)): + if i >= max(NB_TEST, NB_CALIB): break -# -------------------------------------------------------------- -# LOAD THE MODEL -# -------------------------------------------------------------- - -""" -Load the .onnx model and perform some usual graph modifications : - - Remove the flatten nodes; - - Fuse the batchnorm nodes into the biases producers. - - Expand the metaOperators to perform the desired fusions. -""" - -model = aidge_onnx.load_onnx(file_path, verbose=False) -aidge_core.remove_flatten(model) -aidge_core.fuse_batchnorm(model) -aidge_core.expand_metaops(model) -model.save("imported_model") - -# -------------------------------------------------------------- -# SET UP THE AIDGE SCHEDULER -# -------------------------------------------------------------- - -""" -The scheduler is an ordered version of the model, allowing to schedule -nodes to be able to run inferences, for instance. -""" - # Set up the backend model.set_datatype(aidge_core.dtype.float32) model.set_backend(backend) -# Create the Scheduler +# Create the Scheduler scheduler = aidge_core.SequentialScheduler(model) -# -------------------------------------------------------------- -# RUN SOME EXAMPLES INFERENCES -# -------------------------------------------------------------- - -def propagate(model, scheduler, tensor): - """ - Propagate the given tensor into the model and return the - output tensor. 
- """ - print(f"Propagate: {tensor.backend()}") - # Run the inference - scheduler.forward(True, [tensor]) - # Gather the results - output_node = model.get_output_nodes().pop() - output_tensor = output_node.get_operator().get_output(0).clone() - output_tensor.set_backend("cpu") - return np.array(output_tensor) - +# Run some examples accuracy = 0 -if (DO_EXAMPLES): - print('\n EXAMPLE INFERENCES :') - nb_valid = 0 - base_values = [] - for i in range(NB_TEST): - print(f"Inférence: {tensors[i].backend()}") - output_array = propagate(model, scheduler, tensors[i]) - print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) - base_values.append(np.max(output_array)) - if (labels[i] == np.argmax(output_array)): - nb_valid += 1 - accuracy = nb_valid / NB_TEST - print('\n MODEL ACCURACY = ', accuracy * 100, '%') +print('\n EXAMPLE INFERENCES :') +nb_valid = 0 +base_values = [] +for i in range(NB_TEST): + output_array = propagate(model, scheduler, tensors[i]) + print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) + base_values.append(np.max(output_array)) + if (labels[i] == np.argmax(output_array)): + nb_valid += 1 +accuracy = nb_valid / NB_TEST +print('\n MODEL ACCURACY = ', accuracy * 100, '%') # -------------------------------------------------------------- # PERFORM THE QUANTIZATION # -------------------------------------------------------------- -if quantize_model: +if args.dtype != "float32": + + TARGET_TYPE = aidge_core.dtype.int32 + if args.dtype == "int8": + NB_BITS = 8 + else: + aidge_core.Log.fatal(f"Type {args.dtype} not yet supported. \ + \nSupported types : {supported_types}.") + aidge_quantization.quantize_network( network = model, nb_bits = NB_BITS, calibration_set = tensors[0:NB_CALIB], - clipping_mode = CLIPPING, target_type = TARGET_TYPE, - no_quant = NO_QUANT, - optimize_signs = OPTIM_SIGN, - single_shift = SINGLE_SHIFT, - use_cuda = USE_CUDA, - fold_graph = FOLD_GRAPH) - -# Tag the scaling producers -for node in model.get_nodes(): - if node.type() == "Quantizer": - for SNode in get_node_from_metaop(node, "BitShift"): - SNode.get_parent(1).attributes().shift_prod = True - for CNode in get_node_from_metaop(node, "Mul"): - CNode.get_parent(1).attributes().coef_prod = True - -model.save("post_ptq_model") - -# -------------------------------------------------------------- -# RESCALE THE INPUT SAMPLES -# -------------------------------------------------------------- + single_shift = True, + clipping_mode = aidge_quantization.Clipping.MSE, + use_cuda = USE_CUDA) -""" -Once the quantization is done, the graph now only accepts integer inputs. -So we need to rescale the dataset for the data to be within [0, 255]. -Also, tensors should be casted to be the same type as TARGET_TYPE. -""" + # -------------------------------------------------------------- + # TEST THE QUANTIZED MODEL + # -------------------------------------------------------------- -if quantize_model: + # Quantize the samples rescaling = 2**(NB_BITS-1)-1 for i in range(NB_TEST): tensors[i].set_backend("cpu") @@ -308,39 +164,20 @@ if quantize_model: array = np.round(array).astype(int) tensors[i] = aidge_core.Tensor(array) tensors[i].set_datatype(TARGET_TYPE) - tensors[i].set_backend("cpu") - # Setting modele to CPU for export - model.set_backend("cpu") - - -# -------------------------------------------------------------- -# GENERATE NEW SCHEDULER -# -------------------------------------------------------------- -""" -Each time the graph has been change, it has to be reset. 
-Here some Quantizer and Cast nodes have been added. -""" - -""" [Issue] -We need first to manually add an input tensor with the correct datatype, -as it is not automatically done in PTQ. -""" -if quantize_model: - input_node = model.get_ordered_inputs()[0] - input_node[0].get_operator().set_input(0, tensors[0]) + # Set the backend back to CPU + if USE_CUDA: + model.set_backend("cpu") + + # Reset the scheduler as the graph may have changed scheduler.reset_scheduling() + scheduler.generate_scheduling() -# -------------------------------------------------------------- -# PERFORM THE EXAMPLE INFERENCES AGAIN -# -------------------------------------------------------------- - -if (DO_EXAMPLES and quantize_model): - print('\n QUANTIZED EXAMPLE INFERENCES:') + # Perform example inferences on quantized model + print('\n QUANTIZED EXAMPLE INFERENCES :') nb_valid = 0 post_values = [] for i in range(NB_TEST): - print(f"QEI: {tensors[i].backend()}") output_array = propagate(model, scheduler, tensors[i]) print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) post_values.append(np.max(output_array)) @@ -351,154 +188,38 @@ if (DO_EXAMPLES and quantize_model): print('\n MODEL ACCURACY = ', accuracy * 100, '%') print('\n QUANTIZED ACCURACY = ', quant_accuracy * 100, '%') + output_array = propagate(model, scheduler, tensors[0]) # -------------------------------------------------------------- -# FUSE NODES INTO METAOPS -# -------------------------------------------------------------- - -""" -Here is made the link between the Aidge model and the CPP -kernels implementation. In aidge, all the nodes calculations -are performed separately (Pad -> Conv -> Quantizer -> ReLU -> ...). - -However within the CPP export, some core operators are merged -in meta operators. For instance, the padding, scaling and ReLU are -performed within the Conv kernel. - -In this step, we use graph regex techniques to find the desired patterns -within the graph in order to match the export implementation of the kernels. -""" - -# Exclude unwanted producers -""" -Before fusing the nodes, we set a tag on the Producers in order to exclude -from the export the ones holding coefficients, as they are directly handled -within the layers parameters. -""" -exclude_unwanted_producers(model) - -# Fuse nodes -cpp_fuse_to_metaops(model) - -# Remove optional inputs -""" -Some optional inputs may be added by the quantization step (for instance with the clipping nodes). -Here we make sure that they will not be considered as actual graph inputs by the export, by -excluding them from the ordered_inputs list of the model. -""" -remove_optional_inputs(model) - -# Reset scheduler to apply graph modifications -""" -The scheduler always needs to be reset after graph manipulation. -""" -scheduler.reset_scheduling() - -# Name newly created MetaOps -""" -As names are optional in Aidge, the fuse_to_metaops function will not automatically -give a name to the newly created metaOps. However, in an export context, we need -our operators to be named, as this will be used to name the corresponding files. -""" - -scheduler.generate_scheduling() # Scheduler needs to be generated as it has just been reset -set_nodes_names(scheduler) - -# -------------------------------------------------------------- -# LOG OUTPUTS FOR THE FIRST IMAGE OF THE TEST DATASET -# -------------------------------------------------------------- - -""" -Here a final inference is made on the input we want to export and run. 
-This will ensure that all the feature maps tensors (between the layers) -hold the data corresponding to this specific input. -Then, the "log_outputs()" function (called later) will store these tensors -into log files that may be exported as well for comparison purpose. -""" - -output_array = propagate(model, scheduler, tensors[0]) - -print("### Exported Sample ###") -print("Aidge prediction :", np.argmax(output_array), "(" + str(np.max(output_array)) + ")") -print("Label :", labels[0]) - -# -------------------------------------------------------------- -# HANDLING DATATYPE -# -------------------------------------------------------------- - -""" -Now, despite the quantization stage, all the tensors of the model are -still "virtually" in Int32. Before exporting the model, we have to set -tensors' datatypes to Int8, except for biases which should remain in Int32. -""" - -if quantize_model: - set_nodes_datatypes(model) - -# Store tensors values into log files -""" -Once the tensors have been casted, the log_outputs() function can be -called to store their values into log files. -""" - -if os.path.isdir("log_outputs"): - shutil.rmtree("log_outputs") -model.log_outputs("log_outputs") - -# -------------------------------------------------------------- -# TEST MODE -# -------------------------------------------------------------- - -""" -The test mode is mainly used for validation and benchmark. The model will be -exported in a way that each layer's result will be compared with the CPU implementation. -The timings for each layer will be displayed. -In case of error, you will be able to enter debug mode, showing in-layer data or -changing the inputs of the layer, to isolate the source of the issue. -""" - -for node in model.get_nodes(): - node.attributes().dev_mode = DEV_MODE - -# -------------------------------------------------------------- -# AIDGE CMP +# EXPORT THE MODEL # -------------------------------------------------------------- -""" -If the --aidge_cmp option is enabled, the feature maps generated by aidge with the -backend cpu will be exported in the generated export. It will be used as reference -to verify that the results with the optimized kernels are correct for the exported -model. -This option has to be passed to each node in order to be used within the Export Nodes. -(JConv, JPad, ...) that you can find in the "operators" folder. 
-""" +export_folder_name = f"export_lenet_{args.dtype}" -if AIDGE_CMP: - for node in model.get_nodes(): - node.attributes().aidge_cmp = True +aidge_export_cpp.export( + export_folder_name=export_folder_name, + model=model, + scheduler=scheduler, + inputs_tensor=tensors[0], + labels=aidge_core.Tensor(labels[0]), + dev_mode=args.dev_mode, + aidge_cmp=args.aidge_cmp, + memory_manager_args={"wrapping": True} if args.mem_wrap else {}) # -------------------------------------------------------------- -# EXPORT THE MODEL +# COMPILING THE EXPORT # -------------------------------------------------------------- -model.save("exported_model") - -aidge_export_cpp.export(EXPORT_FOLDER, - model, - scheduler, - # tensors[0], - labels = aidge_core.Tensor(labels[0]), - dev_mode = DEV_MODE, - aidge_cmp = AIDGE_CMP) print("\n### Compiling the export ###") try: - for std_line in aidge_core.utils.run_command(["make"], cwd=EXPORT_FOLDER): + command = ["make", f"AIDGE_CMP={str(args.aidge_cmp).lower()}", "SAVE_OUTPUTS=false"] + for std_line in aidge_core.utils.run_command(command, cwd=export_folder_name): print(std_line, end="") except subprocess.CalledProcessError as e: raise RuntimeError(0, f"An error occurred, failed to build export.") from e print("\n### Running the export ###") try: - for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=EXPORT_FOLDER): + for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=export_folder_name): print(std_line, end="") except subprocess.CalledProcessError as e: - raise RuntimeError(0, f"An error occurred, failed to run export.") from e \ No newline at end of file + raise RuntimeError(0, f"An error occurred, failed to run export.") from e diff --git a/examples/export_ResNet18/.gitignore b/examples/export_ResNet18/.gitignore index a6e4e9706bae169c65e15f7a2a8c090fe21618c7..d4813dedaac332924cfe5242c6c49f4f6f4a14ee 100644 --- a/examples/export_ResNet18/.gitignore +++ b/examples/export_ResNet18/.gitignore @@ -1,6 +1,7 @@ # Exclude export artefacts -export_resnet18_int8/ -log_outputs/* +export_resnet18_* +log_outputs +graph_log assets/* data/* log.txt diff --git a/examples/export_ResNet18/resnet18.py b/examples/export_ResNet18/resnet18.py index 81e335567194a83794bd4be8c2dcbd8056fa3ebf..2eba6431151c627be37b6c4223fa5f6abd75eda2 100644 --- a/examples/export_ResNet18/resnet18.py +++ b/examples/export_ResNet18/resnet18.py @@ -7,41 +7,35 @@ In order for this file to work properly, you should first download the imagenet (search for "ILSVRC2012"). 
""" +import os import random +import subprocess import numpy as np -import os -import shutil from PIL import Image -import requests from pathlib import Path -import subprocess from random import randint + # Aidge Modules import aidge_core import aidge_onnx +import aidge_export_cpp import aidge_backend_cpu import aidge_quantization -import aidge_export_cpp -from aidge_export_cpp.export_utils import ( - cpp_fuse_to_metaops, - exclude_unwanted_producers, - set_nodes_names, - set_nodes_datatypes, - normalize) - -from aidge_core.export_utils import remove_optional_inputs, get_node_from_metaop +from aidge_export_cpp.export_utils import * # Torch (Dataset) import torch from torchvision import transforms + # Arguments import argparse supported_types = ["float32", "int8"] parser = argparse.ArgumentParser(description="Export the ResNet18 model with the aidge_export_cpp module.") -parser.add_argument("--dev", action="store_true", help="Export in dev mode") +parser.add_argument("--dev_mode", action="store_true", help="Export in dev mode") +parser.add_argument("--mem_wrap", action="store_true", help="Use memory wrapping") parser.add_argument("--no_cuda", action="store_true", help="Disable USE_CUDA usage to perform inferences and training.") parser.add_argument("--dtype", type=str, choices=supported_types, default="float32", help="Specify the targeted datatype : [int8, float32]") parser.add_argument("--aidge_cmp", action="store_true", help="Use aidge tensor results as reference.") @@ -80,8 +74,14 @@ parser.add_argument( args = parser.parse_args() +# CUDA Usage USE_CUDA = not args.no_cuda +if USE_CUDA: + import aidge_backend_cuda + +backend = "cuda" if USE_CUDA else "cpu" + # Setting Aidge verbose level if args.verbose == 0: aidge_core.Log.set_console_level(aidge_core.Level.Error) @@ -92,124 +92,32 @@ elif args.verbose == 2: elif args.verbose >= 3: aidge_core.Log.set_console_level(aidge_core.Level.Debug) -if USE_CUDA: - import aidge_backend_cuda - -# ------------------------------------------------------------ -# EXPORT CONFIG -# ------------------------------------------------------------ - -""" -Export configuration details : -- RNG_SEED : Fix a random seed for torch to always get the same images from the dataset, - therefore always getting the same output. -- NB_TEST : Number of example inferences to perform (used to get an accuracy approximation). -- NB_CALIB : Number of samples used for the calibration step of quantization. -- MODEL_NAME : Should be the same name as the onnx file you want to load and export. -- DO_EXAMPLES : Perform example inferences (and allow to get accuracy approximation) -- NB_BITS : Quantization output precision. Should be 8 to work with this export. -- TARGET_TYPE : The aidge datatype for tensors to be casted after the quantization step. -- OPTIM_SIGN : Quantization optional optimization based on data sign. -- SINGLE_SHIFT : Quantization option specifying if inserted scaling nodes should be - single shift or floating point. -- NO_QUANT : Skip the quantization step. Should be set to False. -- CLIPPING : Clipping method during quantization. -- FOLD_GRAPH : The quantization step adds cast nodes to cast the graph into the given TARGET_TYPE. - Enabling the FOLD_GRAPH will automatically fold these nodes into the following - ones at the end of quantization step. -- USE_CUDA : Determine if the quantization step uses the GPU. It is generally recommended - to enable this option if you have access to GPUs as the quantization step - may take a while to complete. 
-- DEV_MODE : The dev mode allows to identify errors more easily export the model with - symbolic links enabling to modify the source files directly in the - generated export (make sure you installed the export plugin running - `pip install -e .`). - Enabled running this python file, adding the --test argument. -- AIDGE_MODE : Saves and export the outputs generated by the aidge inferences in order - to compare it with the export outputs. - Enabled running this python file, adding the --aidge_cmp argument. -""" - -print(" Available backends : ", aidge_core.Tensor.get_available_backends()) - -quantize_model = False -NB_BITS = 32 -TARGET_TYPE = aidge_core.dtype.float32 +# Init random seed +RNG_SEED = 1234 +torch.manual_seed(RNG_SEED) +random.seed(RNG_SEED) +np.random.seed(RNG_SEED) -if args.dtype == "float32": - quantize_model = False -elif args.dtype == "int8": - quantize_model = True - NB_BITS = 8 - TARGET_TYPE = aidge_core.dtype.int32 # int8 not yet available -else: - print(f"[ERROR] Datatype '{args.dtype}' not supported.") - print(f"[ERROR] Supported datatypes : {supported_types}.") - exit(1) +# -------------------------------------------------------------- +# CREATE THE SAMPLES +# -------------------------------------------------------------- -RNG_SEED = 1234 NB_TEST = 20 # Test set NB_CALIB = 20 # Calibration set -MODEL_NAME = 'resnet18' -EXPORT_FOLDER = f"export_{MODEL_NAME}_int8" -DO_EXAMPLES = True - -# Quantization params -OPTIM_SIGN = False -SINGLE_SHIFT = True -ROUNDING = True -NO_QUANT = False -CLIPPING = aidge_quantization.Clipping.MSE # 'MAX' -FOLD_GRAPH = True - -# Export modes -DEV_MODE = args.dev -AIDGE_CMP = args.aidge_cmp - -# Path to databases -IMAGENET_PATH = args.imagenet_path # Path to ImageNet database -LABEL_PATH = args.imagenet_labels # File containing labels of image of val folder (Look for val.txt) -########################### - -def print_cfg(): - print("") - print(' RNG_SEED = ', RNG_SEED) - print(' MODEL_NAME = ', MODEL_NAME) - print(' NB_TEST = ', NB_TEST) - print(' NB_CALIB = ', NB_CALIB) - print(' NB_BITS = ', NB_BITS) - print(' OPTIM_SIGN = ', OPTIM_SIGN) - print(' NO_QUANT = ', NO_QUANT) - print(' CLIPPING = ', CLIPPING) - print(' SINGLE_SHIFT = ', SINGLE_SHIFT) - print(' TARGET_TYPE = ', TARGET_TYPE) - print(' FOLD_GRAPH = ', FOLD_GRAPH) - print(' USE_CUDA = ', USE_CUDA) - print(' DEV_MODE = ', DEV_MODE) - print(' IMAGENET_PATH = ', IMAGENET_PATH) - print(' LABEL_PATH = ', LABEL_PATH) - print(' MOCK_DB = ', args.mock_db) - -print_cfg() - -torch.manual_seed(RNG_SEED) -random.seed(RNG_SEED) -np.random.seed(RNG_SEED) -backend = "cuda" if USE_CUDA else "cpu" -aidge_tensors = [] +tensors = [] labels = [] if args.mock_db: - for i in range(NB_TEST): - aidge_tensor = aidge_core.Tensor(dims=(1, 3, 224, 224)) - aidge_tensor.set_backend(backend) - aidge_tensor.set_datatype(aidge_core.dtype.float32) - aidge_core.uniform_filler(aidge_tensor, -1.0, 1.0) - aidge_tensors.append(aidge_tensor) + for i in range(max(NB_TEST, NB_CALIB)): + tensor = aidge_core.Tensor(dims=(1, 3, 224, 224)) + tensor.set_backend(backend) + tensor.set_datatype(aidge_core.dtype.float32) + aidge_core.uniform_filler(tensor, -1.0, 1.0) + tensors.append(tensor) labels.append(randint(1, 1000)) else: image_label_pairs = [] - with open(LABEL_PATH, 'r') as f: + with open(args.imagenet_labels, 'r') as f: for line in f: parts = line.strip().split() if len(parts) == 2: @@ -220,23 +128,15 @@ else: NB_SELECT = max(NB_TEST, NB_CALIB) # Check that NB_TEST and NB_CALIB are fixed selected_pairs = 
image_label_pairs[:NB_SELECT] - # -------------------------------------------------------------- - # CREATE THE SAMPLES - # -------------------------------------------------------------- - transform_val = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) - tensors = [] - labels = [] paths = [] - index = 0 - for image_name, label in selected_pairs: - image_path = os.path.join(IMAGENET_PATH, image_name) + image_path = os.path.join(args.imagenet_path, image_name) if os.path.exists(image_path): try: image = Image.open(image_path) @@ -250,50 +150,39 @@ else: print(f"Error with image {image_path}: {e}") - for tensor in tensors: + for i, tensor in enumerate(tensors): array = tensor.numpy() array = np.reshape(array, (1, 3, 224, 224)) array = normalize(array) aidge_tensor = aidge_core.Tensor(array) aidge_tensor.set_backend(backend) aidge_tensor.set_datatype(aidge_core.dtype.float32) - aidge_tensors.append(aidge_tensor) - + tensors[i] = aidge_tensor # -------------------------------------------------------------- -# LOAD THE MODEL +# DOWNLOAD & LOAD THE MODEL # -------------------------------------------------------------- -""" -Load the .onnx model and perform some usual graph modifications : - - Remove the flatten nodes; - - Fuse the batchnorm nodes into the biases producers. - - Expand the metaOperators to perform the desired fusions. -""" - -# Define the target path and filename +# Download the model file_url = "https://huggingface.co/EclipseAidge/resnet18/resolve/main/resnet18_imagenet_1k.onnx?download=true" -file_path = Path(MODEL_NAME + "_imagenet_1k.onnx") - +file_path = "resnet18_imagenet_1k.onnx" aidge_core.utils.download_file(file_path, file_url) +# Load the model model = aidge_onnx.load_onnx(file_path, verbose=False) -model.save("imported_model") +# -------------------------------------------------------------- +# CLEAN THE MODEL +# -------------------------------------------------------------- + aidge_core.remove_flatten(model) aidge_core.fuse_batchnorm(model) -aidge_core.expand_metaops(model) -model.save("imported_model_fused_bn") +aidge_core.expand_metaops(model, name_format="{0}_{1}_{2}") # -------------------------------------------------------------- -# SET UP THE AIDGE SCHEDULER +# TEST THE MODEL # -------------------------------------------------------------- -""" -The scheduler is an ordered version of the model, allowing to schedule -nodes to be able to run inferences, for instance. 
-""" - # Set up the backend model.set_datatype(aidge_core.dtype.float32) model.set_backend(backend) @@ -301,113 +190,73 @@ model.set_backend(backend) # Create the Scheduler scheduler = aidge_core.SequentialScheduler(model) -# -------------------------------------------------------------- -# RUN SOME EXAMPLES INFERENCES -# -------------------------------------------------------------- - -def propagate(model, scheduler, aidge_tensor): - """ Propagate the given tensor into the model - """ - # Run the inference - scheduler.forward(True, [aidge_tensor]) - # Gather the results - output_node = model.get_output_nodes().pop() - output_tensor = output_node.get_operator().get_output(0).clone() - output_tensor.set_backend("cpu") - return np.array(output_tensor) - +# Run some examples accuracy = 0 -if (DO_EXAMPLES): - print('\n EXAMPLE INFERENCES :') - nb_valid = 0 - base_values = [] - for i in range(NB_TEST): - output_array = propagate(model, scheduler, aidge_tensors[i]) - print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) - base_values.append(np.max(output_array)) - if (labels[i] == np.argmax(output_array)): - nb_valid += 1 - accuracy = nb_valid / NB_TEST - print('\n MODEL ACCURACY = ', accuracy * 100, '%') +print('\n EXAMPLE INFERENCES :') +nb_valid = 0 +base_values = [] +for i in range(NB_TEST): + output_array = propagate(model, scheduler, tensors[i]) + print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) + base_values.append(np.max(output_array)) + if (labels[i] == np.argmax(output_array)): + nb_valid += 1 +accuracy = nb_valid / NB_TEST +print('\n MODEL ACCURACY = ', accuracy * 100, '%') - -#-------------------------------------------------------------- +# -------------------------------------------------------------- # PERFORM THE QUANTIZATION # -------------------------------------------------------------- -if quantize_model: +if args.dtype != "float32": + + TARGET_TYPE = aidge_core.dtype.int32 + if args.dtype == "int8": + NB_BITS = 8 + else: + aidge_core.Log.fatal(f"Type {args.dtype} not yet supported. \ + \nSupported types : {supported_types}.") + aidge_quantization.quantize_network( network = model, nb_bits = NB_BITS, - calibration_set = aidge_tensors[0:NB_CALIB], - clipping_mode = CLIPPING, + calibration_set = tensors[0:NB_CALIB], target_type = TARGET_TYPE, - no_quant = NO_QUANT, - optimize_signs = OPTIM_SIGN, - single_shift = SINGLE_SHIFT, - use_cuda = USE_CUDA, - fold_graph = FOLD_GRAPH) - -# Tag the scaling producers -for node in model.get_nodes(): - if node.type() == "Quantizer": - for SNode in get_node_from_metaop(node, "BitShift"): - SNode.get_parent(1).attributes().shift_prod = True - for CNode in get_node_from_metaop(node, "Mul"): - CNode.get_parent(1).attributes().coef_prod = True - -model.save("post_ptq_model") + single_shift = True, + clipping_mode = aidge_quantization.Clipping.MSE, + use_cuda = USE_CUDA) -# -------------------------------------------------------------- -# RESCALE THE INPUT SAMPLES -# -------------------------------------------------------------- + # -------------------------------------------------------------- + # TEST THE QUANTIZED MODEL + # -------------------------------------------------------------- -""" -Once the quantization is done, the graph now only accepts integer inputs. -So we need to rescale the dataset for the data to be within [0, 255]. -Also, tensors should be casted to be the same type as TARGET_TYPE. 
-""" -if quantize_model: + # The CUDA backend does not yet support all quantized nodes + if USE_CUDA: + model.set_backend("cpu") + + # Quantize the samples rescaling = 2**(NB_BITS-1)-1 - for i in range(max(NB_TEST, NB_CALIB)): - array = np.array(aidge_tensors[i]) * rescaling + for i in range(NB_TEST): + tensors[i].set_backend("cpu") + array = np.array(tensors[i]) * rescaling array = np.round(array).astype(int) - aidge_tensors[i] = aidge_core.Tensor(array) - aidge_tensors[i].set_datatype(TARGET_TYPE) - aidge_tensors[i].set_backend("cpu") - # Setting modele to CPU for export - model.set_backend("cpu") - -# -------------------------------------------------------------- -# GENERATE NEW SCHEDULER -# -------------------------------------------------------------- - -""" -Each time the graph has been change, it has to be reset. -Here some Quantizer and Cast nodes have been added. -""" - -""" [Issue] -We need first to manually add an input tensor with the correct datatype, -as it is not automatically done in PTQ. -""" -if quantize_model: - input_node = model.get_ordered_inputs()[0] - input_node[0].get_operator().set_input(0, aidge_tensors[0]) + tensors[i] = aidge_core.Tensor(array) + tensors[i].set_datatype(TARGET_TYPE) + + # The input has not been casted + for node in model.get_ordered_inputs(): + node[0].get_operator().get_input(0).set_datatype(TARGET_TYPE) + + # Reset the scheduler as the graph may have changed scheduler.reset_scheduling() + scheduler.generate_scheduling() -# -------------------------------------------------------------- -# PERFORM THE EXAMPLE INFERENCES AGAIN -# -------------------------------------------------------------- - -model.save("post_ptq") - -if (DO_EXAMPLES and quantize_model): + # Perform example inferences on quantized model print('\n QUANTIZED EXAMPLE INFERENCES :') nb_valid = 0 post_values = [] for i in range(NB_TEST): - output_array = propagate(model, scheduler, aidge_tensors[i]) + output_array = propagate(model, scheduler, tensors[i]) print(labels[i], ' VS ', np.argmax(output_array), ' -> ', np.max(output_array)) post_values.append(np.max(output_array)) if (labels[i] == np.argmax(output_array)): @@ -417,164 +266,38 @@ if (DO_EXAMPLES and quantize_model): print('\n MODEL ACCURACY = ', accuracy * 100, '%') print('\n QUANTIZED ACCURACY = ', quant_accuracy * 100, '%') - output_array = propagate(model, scheduler, aidge_tensors[0]) - -if USE_CUDA: - model.set_backend("cpu") - for aidge_tensor in aidge_tensors: - aidge_tensor.set_backend("cpu") - -# -------------------------------------------------------------- -# FUSE NODES INTO METAOPS -# -------------------------------------------------------------- - -""" -Here is made the link between the Aidge model and the CPP -kernels implementation. In aidge, all the nodes calculations -are performed separately (Pad -> Conv -> Quantizer -> ReLU -> ...). - -However within the CPP export, some core operators are merged -in meta operators. For instance, the padding, scaling and ReLU are -performed within the Conv kernel. - -In this step, we use graph regex techniques to find the desired patterns -within the graph in order to match the export implementation of the kernels. -""" - -# Exclude unwanted producers -""" -Before fusing the nodes, we set a tag on the Producers in order to exclude -from the export the ones holding coefficients, as they are directly handled -within the layers parameters. 
-""" -exclude_unwanted_producers(model) - -# Fuse nodes -cpp_fuse_to_metaops(model) - -# Remove optional inputs -""" -Some optional inputs may be added by the quantization step (for instance with the clipping nodes). -Here we make sure that they will not be considered as actual graph inputs by the export, by -excluding them from the ordered_inputs list of the model. -""" -remove_optional_inputs(model) - -# Reset scheduler to apply graph modifications -""" -The scheduler always needs to be reset after graph manipulation. -""" -scheduler.reset_scheduling() - -# Name newly created MetaOps -""" -As names are optional in Aidge, the fuse_to_metaops function will not automatically -give a name to the newly created metaOps. However, in an export context, we need -our operators to be named, as this will be used to name the corresponding files. -""" -scheduler.generate_scheduling() # Scheduler needs to be generated as it has just been reset -set_nodes_names(scheduler) - -# -------------------------------------------------------------- -# LOG OUTPUTS FOR THE LAST IMAGE OF THE TEST DATASET -# -------------------------------------------------------------- - -""" -Here a final inference is made on the input we want to export and run. -This will ensure that all the feature maps tensors (between the layers) -hold the data corresponding to this specific input. -Then, the "log_outputs()" function (called later) will store these tensors -into log files that may be exported as well for comparison purpose. -""" - -output_array = propagate(model, scheduler, aidge_tensors[0]) + output_array = propagate(model, scheduler, tensors[0]) -print("### Exported Sample ###") -print("Aidge prediction after quantization :", np.argmax(output_array), "(" + str(np.max(output_array)) + ")") -print("Label :", labels[0]) - -# -------------------------------------------------------------- -# HANDLING DATATYPE # -------------------------------------------------------------- - -""" -Now, despite the quantization stage, all the tensors of the model are -still "virtually" in Int32. Before exporting the model, we have to set -tensors' datatypes to Int8, except for biases which should remain in Int32. -""" -if quantize_model: - set_nodes_datatypes(model) - -# Store tensors values into log files -""" -Once the tensors has been casted, the log_outputs() function can be -called to store their values into log files. -""" - -if os.path.isdir("log_outputs"): - shutil.rmtree("log_outputs") -model.log_outputs("log_outputs") - -# -------------------------------------------------------------- -# TEST MODE -# -------------------------------------------------------------- - -""" -The test mode is mainly used for validation and benchmark. The model will be -exported in a way that each layer's result will be compared with the CPU implementation. -The timings for each layer will be displayed. -In case of error, you will be able to enter debug mode, showing in-layer data or -changing the inputs of the layer, to isolate the source of the issue. -""" - -for node in model.get_nodes(): - node.attributes().dev_mode = DEV_MODE - -# -------------------------------------------------------------- -# AIDGE CMP +# EXPORT THE MODEL # -------------------------------------------------------------- -""" -If the --aidge_cmp option is enabled, the feature maps generated by aidge with the -backend cpu will be exported in the generated export. It will be used as reference -to verify that the results with the optimized kernels are correct for the exported -model. 
-This option has to be passed to each node in order to be used within the Export Nodes. -(JConv, JPad, ...) that you can find in the "export_gen/operator_export" folder. -""" +export_folder_name = f"export_resnet18_{args.dtype}" -if AIDGE_CMP: - for node in model.get_nodes(): - node.attributes().aidge_cmp = True +aidge_export_cpp.export( + export_folder_name=export_folder_name, + model=model, + scheduler=scheduler, + inputs_tensor=tensors[0], + labels=aidge_core.Tensor(labels[0]), + dev_mode=args.dev_mode, + aidge_cmp=args.aidge_cmp, + memory_manager_args={"wrapping": True} if args.mem_wrap else {}) # -------------------------------------------------------------- -# EXPORT THE MODEL +# COMPILING THE EXPORT # -------------------------------------------------------------- -model.save("exported_model") -inputs_tensor = aidge_core.Tensor(np.array(aidge_tensors[0])) -inputs_tensor.set_data_format(aidge_core.dformat.nchw) # Init the dataformat (default -> nchw) -inputs_tensor.set_data_format(aidge_core.dformat.nhwc) # Transpose the data (nchw -> nhwc) -if args.dtype == "int8": - inputs_tensor.set_datatype(aidge_core.dtype.int8) - -aidge_export_cpp.export(EXPORT_FOLDER, - model, - scheduler, - labels = aidge_core.Tensor(labels[0]), - inputs_tensor=inputs_tensor, - dev_mode = DEV_MODE, - aidge_cmp = AIDGE_CMP) - print("\n### Compiling the export ###") try: - for std_line in aidge_core.utils.run_command(["make"], cwd=EXPORT_FOLDER): + command = ["make", f"AIDGE_CMP={str(args.aidge_cmp).lower()}", "SAVE_OUTPUTS=false"] + for std_line in aidge_core.utils.run_command(command, cwd=export_folder_name): print(std_line, end="") except subprocess.CalledProcessError as e: raise RuntimeError(0, f"An error occurred, failed to build export.") from e print("\n### Running the export ###") try: - for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=EXPORT_FOLDER): + for std_line in aidge_core.utils.run_command(["./bin/run_export"], cwd=export_folder_name): print(std_line, end="") except subprocess.CalledProcessError as e: - raise RuntimeError(0, f"An error occurred, failed to run export.") from e \ No newline at end of file + raise RuntimeError(0, f"An error occurred, failed to run export.") from e diff --git a/version.txt b/version.txt index 9e11b32fcaa96816319e5d0dcff9fb2873f04061..d15723fbe8de36b1c3ae302c77d8095459ea88e6 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.1 +0.3.2
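Note on the propagate helper: after this patch, both example scripts (examples/export_LeNet/lenet.py and examples/export_ResNet18/resnet18.py) call propagate(model, scheduler, tensor) without defining it locally; it is assumed to now come from aidge_export_cpp.export_utils via the wildcard import. A minimal sketch, reconstructed from the definition this patch removes from the scripts (not the library's authoritative implementation):

    import numpy as np
    import aidge_core

    def propagate(model: aidge_core.GraphView,
                  scheduler: aidge_core.SequentialScheduler,
                  tensor: aidge_core.Tensor) -> np.ndarray:
        """Run one inference on `tensor` and return the output as a NumPy array.

        Assumed equivalent of the helper removed from the example scripts,
        expected to be provided by aidge_export_cpp.export_utils.
        """
        # Run the inference through the scheduled graph
        scheduler.forward(True, [tensor])
        # Gather the result from the (single) output node of the GraphView
        output_node = model.get_output_nodes().pop()
        output_tensor = output_node.get_operator().get_output(0).clone()
        output_tensor.set_backend("cpu")
        return np.array(output_tensor)

With the renamed command-line options introduced here, an int8 LeNet export would be launched as, for example: python lenet.py --dtype int8 --dev_mode --mem_wrap --aidge_cmp (the --dev_mode, --mem_wrap and --aidge_cmp flags are optional).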