Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (397)
Showing with 1535 additions and 478 deletions
......@@ -13,6 +13,11 @@ __pycache__
dist*/
aidge_export_cpp/_version.py
wheelhouse/*
env_aidge/
# Temp test folders
aidge_export_cpp/unit_tests/*_temp_test
*_test/
# Mermaid
*.mmd
......@@ -22,3 +27,6 @@ xml*/
# ONNX
*.onnx
# GDB
.gdb_history
\ No newline at end of file
......@@ -12,12 +12,21 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/pip.gitlab-ci.yml'
# Since aidge_export_cpp is a pure Python package, building on Windows and on Ubuntu doesn't differ
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
before_script:
- !reference [.setup:test:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
# Need to install extra dependencies for tests:
- python -m pip install torch torchvision
\ No newline at end of file
# Version 0.2.1 (January 31, 2025)
# Version 0.2.0 (December 6, 2024)
# Version 0.0.1 (January 23, 2024)
......
......@@ -2,15 +2,15 @@ r"""
Aidge Export for CPP standalone projects
"""
from .export_registry import ExportLibCpp
from .operators import *
from collections import defaultdict
import aidge_core
from pathlib import Path
from aidge_export_cpp.utils import ROOT
from ._version import *
# Constants
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]
from .export_registry import ExportLibCpp
from .export_utils import *
from .operators import *
from .export import *
from . import benchmark
import contextlib
import os
from shutil import rmtree
from subprocess import run
import numpy as np
import aidge_core
import aidge_backend_cpu
import aidge_export_cpp
def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# load and set up the model
# model.set_datatype(ai.dtype.float32)
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
# for i in input_data:
# nb_dims = len(i[1].shape)
# if nb_dims == 3:
# ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
# if nb_dims == 4:
# ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
# else:
# ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
op.associate_input(i, ordered_inputs[i])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
for i in range(len(ordered_inputs)):
ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
model.set_dataformat(aidge_core.dformat.nhwc)
model.set_backend(aidge_export_cpp.ExportLibCpp._name)
aidge_core.adapt_to_backend(model)
aidge_core.adapt_fc_params_format(model)
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# for ordered_input in ordered_inputs:
# ordered_input.set_backend("cpu")
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
# load and set up the model
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = [aidge_core.Tensor(i[1]) for i in input_data]
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
ordered_inputs[i].set_data_format(aidge_core.dformat.nchw)
op.associate_input(i, ordered_inputs[i])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
for i in range(len(ordered_inputs)):
ordered_inputs[i].set_data_format(aidge_core.dformat.nhwc)
model.set_dataformat(aidge_core.dformat.nhwc)
model.set_backend(aidge_export_cpp.ExportLibCpp._name)
aidge_core.adapt_to_backend(model)
aidge_core.adapt_fc_params_format(model)
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler.reset_scheduling()
scheduler.generate_scheduling()
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
outputs_str: list[str] = output_str.stdout.strip().split('\n')
outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
output_tensors = []
outputs_dims = [pair[0].get_operator().get_output(pair[1]).dims() for pair in model.get_ordered_outputs()]
for out_idx, arr in enumerate(outputs):
t = aidge_core.Tensor(arr.reshape(outputs_dims[out_idx]))
t.set_data_format(aidge_core.dformat.nhwc)
t.set_data_format(aidge_core.dformat.nchw)
output_tensors.append(np.array(t))
return output_tensors
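For reference, here is a minimal usage sketch of the two helpers above. The model source, input name and shape are illustrative assumptions (they are not part of this diff); aidge_onnx is assumed to be available, as the CI dependency list suggests.

    # Hypothetical usage of measure_inference_time() and compute_output().
    import numpy as np
    import aidge_onnx
    from aidge_export_cpp import benchmark

    model = aidge_onnx.load_onnx("model.onnx")           # any aidge_core.GraphView works here
    input_data = [("input_0", np.random.rand(1, 3, 32, 32).astype(np.float32))]

    timings = benchmark.measure_inference_time(model, input_data,
                                               nb_warmup=5, nb_iterations=20)
    print("mean timing:", sum(timings) / len(timings))

    outputs = benchmark.compute_output(model, input_data)
    print("first output shape:", outputs[0].shape)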
import re
import os
import numpy as np
import shutil
from pathlib import Path
from typing import List, Union
import aidge_core
from aidge_core.mem_info import generate_optimized_memory_info
from aidge_core.export_utils import scheduler_export, generate_main_cpp
from aidge_core.export_utils.code_generation import *
from aidge_core.mem_info import compute_default_mem_info
from aidge_export_cpp.utils import ROOT
from aidge_export_cpp.utils.converter import numpy_dtype2ctype
from aidge_export_cpp import ExportLibCpp
from aidge_export_cpp.utils.generation import *
# from aidge_export_cpp.memory import *
def generate_input_file(export_folder:str,
array_name:str,
array: np.ndarray):
# If directory doesn't exist, create it
if not os.path.exists(export_folder):
os.makedirs(export_folder)
generate_file(
file_path=f"{export_folder}/{array_name}.h",
template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
dims = array.shape,
data_t = numpy_dtype2ctype(array.dtype),
name = array_name,
values = array.tolist()
)
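A minimal sketch of calling generate_input_file(); the export folder and array name below are illustrative only.

    # Write a C header for a sample input tensor using the inputs.jinja template.
    import numpy as np

    sample = np.random.rand(1, 3, 8, 8).astype(np.float32)
    generate_input_file(export_folder="export_cpp/dnn/include",   # illustrative path
                        array_name="conv1_input_0",               # illustrative name
                        array=sample)
    # -> creates export_cpp/dnn/include/conv1_input_0.h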
def export(export_folder_name, graphview, scheduler, mem_wrapping=False):
aidge_core.export_utils.scheduler_export(
scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=compute_default_mem_info
)
# export_folder = Path().absolute() / export_folder_name
# os.makedirs(str(export_folder), exist_ok=True)
# dnn_folder = export_folder / "dnn"
# os.makedirs(str(dnn_folder), exist_ok=True)
# list_actions = []
# list_configs = []
# peak_mem, mem_info = compute_default_mem_info(scheduler)
# list_forward_nodes = scheduler.get_static_scheduling()
# for node in list_forward_nodes:
# if ExportLibCpp.exportable(node):
# op = ExportLibCpp.get_export_node(node)(node, mem_info[node])
# # For configuration files
# list_configs = op.export(dnn_folder, list_configs)
# # For forward file
# list_actions = op.forward(list_actions)
# else:
# raise RuntimeError(f"Operator not supported: {node.type()} !")
# # Memory management
# # stats_folder = export_folder / "statistics"
# # os.makedirs(str(stats_folder), exist_ok=True)
# # mem_size, mem_info = generate_optimized_memory_info(stats_folder, scheduler, mem_wrapping)
# # peak_mem, mem_info = compute_default_mem_info(scheduler)
# # Generate the memory file
# # generate_file(
# # str(dnn_folder / "memory" / "mem_info.h"),
# # str(ROOT / "templates" / "memory" / "mem_info.jinja"),
# # mem_size = mem_size,
# # mem_info_legends = MEMORY_INFO_TEMPLATE,
# # mem_info = mem_info
# # )
# # list_configs.append("memory/mem_info.h")
# # Get entry nodes
# # Store the datatype & name
# list_inputs_name = []
# for node in graphview.get_input_nodes():
# for idx, node_input_tuple in enumerate(node.inputs()):
# node_input, _ = node_input_tuple
# if node_input is None:
# export_type = aidge2c(node.get_operator().get_output(0).dtype())
# list_inputs_name.append((export_type, f"{node.name()}_input_{idx}"))
# elif node_input not in graphview.get_nodes():
# export_type = aidge2c(node_input.get_operator().get_output(0).dtype())
# list_inputs_name.append((export_type, node_input.name()))
# # Get output nodes
# # Store the datatype & name, like entry nodes
# list_outputs_name = []
# for node in graphview.get_nodes():
# if len(node.get_children()) == 0:
# export_type = aidge2c(node.get_operator().get_output(0).dtype())
# list_outputs_name.append((export_type, f"{node.name()}_output_0"))
# # Generate forward file
# # TODO: for now the mem type is bound for all intermediate results, should change.
# # Note that we may have all inputs constants, hence select output type
# assert len(list_outputs_name) >= 1, f"TODO: requires some output to determine mem type"
# mem_ctype = list_outputs_name[0][0]
# generate_file(
# str(dnn_folder / "src" / "forward.cpp"),
# str(ROOT / "templates" / "network" / "network_forward.jinja"),
# headers=set(list_configs),
# actions=list_actions,
# inputs= list_inputs_name,
# outputs=list_outputs_name,
# mem_ctype=mem_ctype,
# peak_mem=peak_mem
# )
# # Generate dnn API
# generate_file(
# str(dnn_folder / "include" / "dnn.hpp"),
# str(ROOT / "templates" / "network" / "dnn_header.jinja"),
# libraries=[],
# functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
# )
# # Copy all static files in the export
# shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
# shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
# shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
from aidge_export_cpp.export_utils import *
def export(export_folder_name: str,
model: aidge_core.GraphView,
scheduler: Union[List[aidge_core.Node],
aidge_core.Scheduler],
inputs_tensor: aidge_core.Tensor = None,
labels: aidge_core.Tensor = None,
dev_mode: bool = False,
aidge_cmp: bool = False,
memory_manager = generate_optimized_memory_info,
memory_manager_args = {}):
""" Export an aidge_core.Scheduler to C++ code
:param export_folder_name: Export folder name
:type export_folder_name: str
:param model: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
:type model: aidge_core.GraphView
:param scheduler: Scheduler instance managing the computation graph.
                  Uses `graph_view` and `get_sequential_static_scheduling` methods
                  to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
:param inputs_tensor: **For future** argument to provide a tensor to use in the main function, not implemented yet!
                      By default, the inputs of the given graph will be exported.
:type inputs_tensor: aidge_core.Tensor
:param labels: Argument to provide labels tensor to generate and use in the main function.
:type labels: aidge_core.Tensor
:param dev_mode: Whether or not the developer mode is enabled. If enabled, the export files
                 will be symlinks to the aidge_export_cpp module. Therefore, modifying
                 a file within the export will change the module as well.
:type dev_mode: boolean
:param aidge_cmp: If True, export the Aidge reference feature maps alongside the generated
                  code so that the export outputs can be compared against them.
:type aidge_cmp: boolean
"""
# Graph Log (Save the state of the graph through export steps)
os.makedirs("graph_log", exist_ok=True)
model.save(f"graph_log/0_export_start")
# Remove scaling producers from the export
exclude_unwanted_producers(model)
# Fuse nodes into MetaOps adapted to the CPP Export
cpp_fuse_to_metaops(model)
model.save(f"graph_log/1_fused_model")
# Reset the scheduler after graph modification
scheduler = aidge_core.SequentialScheduler(model) if scheduler is None else scheduler
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# Normalize nodes names
set_nodes_names(scheduler)
model.save(f"graph_log/2_named_model")
# Last inference to set the inputs as well as the ifmaps (aidge_cmp)
if inputs_tensor is not None:
output_array = propagate(model, scheduler, inputs_tensor)
aidge_core.Log.notice(f"Exported sample results : {np.argmax(output_array)} ( {str(np.max(output_array))} )")
aidge_core.Log.notice(f"Label : {labels}")
elif aidge_cmp:
aidge_cmp = False
aidge_core.Log.error("aidge_cmp : No input_tensor has been provided to the export() function.\n\
Therefore ifmaps have not been generated and aidge_cmp cannot be used.")
# Set nodes datatypes if the model has been quantized
# TODO : Should be changed with future quantization feature
if inputs_tensor is not None:
if inputs_tensor.dtype() == aidge_core.dtype.int32:
set_nodes_datatypes(model) # Set datatype to int8 only
inputs_tensor.set_datatype(aidge_core.dtype.int8)
model.save(f"graph_log/3_set_datatypes")
# [aidge_cmp] Export feature maps tensors as json
if aidge_cmp:
generate_aidge_ifmaps(model)
# [aidge_cmp] Set flags on each node
if aidge_cmp:
for node in model.get_nodes():
node.attributes().aidge_cmp = True
# Set model's dataformat (NHWC)
## Inputs
for in_node in model.get_ordered_inputs():
input = in_node[0].get_operator().get_input(0)
if input is not None:
# Transpose the input
input_cpy = input.clone()
input_cpy.set_data_format(aidge_core.dformat.nchw)
input_cpy.set_data_format(aidge_core.dformat.nhwc)
in_node[0].get_operator().set_input(0, input_cpy)
## Rest of the graph
model.set_dataformat(aidge_core.dformat.nhwc)
model.save(f"graph_log/4_set_dataformats")
# Set model's backend
model.set_backend(ExportLibCpp._name)
# Adapt the graph to the selected backend
aidge_core.adapt_to_backend(model)
model.save(f"graph_log/5_adapt_to_backend")
aidge_core.adapt_fc_params_format(model)
model.save(f"graph_log/6_adapt_fc_params_format")
# At this point, the graph dimensions are supposed to be statically
# forwardable, thus allow_data_dependency can be safely set to True
dims = []
for in_node in model.get_ordered_inputs():
dims.append(in_node[0].get_operator().get_input(0).dims())
model.forward_dims(dims=dims, allow_data_dependency=True)
# Reset the scheduling as the graph may have been changed
scheduler.reset_scheduling()
scheduler.generate_scheduling()
# Remove existing export
export_folder_name = Path(export_folder_name)
if os.path.isdir(export_folder_name):
print("Removing existing export directory...")
shutil.rmtree(export_folder_name)
# Save the model
model.save("graph_log/7_exported_model")
# Setup stats folder
if "stats_folder" not in memory_manager_args:
memory_manager_args["stats_folder"] = f"{export_folder_name}/stats"
# Generate the export
scheduler_export(scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=memory_manager,
memory_manager_args=memory_manager_args,
dev_mode=dev_mode)
model.save(f"{export_folder_name}/graph")
# Generate main file
generate_main_cpp(export_folder_name, model, labels=labels, inputs_tensor=inputs_tensor)
# Generate log files (aidge_cmp option)
if aidge_cmp:
export_aidge_ifmaps(export_folder_name)
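A hedged end-to-end sketch of driving the export() entry point defined above. Model loading, shapes and the folder name are assumptions for illustration; depending on the model, quantization or dimension propagation may be needed beforehand.

    import numpy as np
    import aidge_core
    import aidge_onnx
    from aidge_export_cpp import export

    model = aidge_onnx.load_onnx("model.onnx")        # assumed model source
    model.set_backend("cpu")
    scheduler = aidge_core.SequentialScheduler(model)

    inputs_tensor = aidge_core.Tensor(
        np.random.rand(1, 3, 32, 32).astype(np.float32))

    export("export_cpp_model",            # export folder name (illustrative)
           model,
           scheduler,
           inputs_tensor=inputs_tensor)
    # The generated project is then built with `make` inside "export_cpp_model".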
from aidge_core.export_utils import ExportLib
from aidge_export_cpp.utils import ROOT
from aidge_export_cpp import ROOT
class ExportLibCpp(ExportLib):
_name="export_cpp"
static_files={
str(ROOT / "static" / "Makefile"): "",
str(ROOT / "static" / "include" / "network" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "include" / "network" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "rescaling_utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "activation_utils.hpp"): "dnn/include/network",
}
import os
import json
import numpy as np
from collections import OrderedDict
import aidge_core
from aidge_core.export_utils import get_node_from_metaop, aidge2c, generate_file
from aidge_export_cpp import ROOT
def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
"""
Fuse nodes into metaops adapted for the CPP Export
TODO: These recipes should be in aidge_core
:param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
ordered input/output data within the computational graph.
"""
cpp_recipes = OrderedDict({
# Quantization
"QMul": "Mul->Quantizer", # Fixed Point Scaling
# FC
"QFC": "FC->(Quantizer|QMul)",
"FCAct": "(FC|QFC)->ReLU",
# Conv
"QConv": "Conv2D->(Quantizer|QMul)",
"PadConv": "(QConv|Conv2D)<-Pad2D",
"ConvAct": "(QConv|Conv2D)->ReLU",
"PadConvAct": "PadConv->ReLU",
# ConvDw
"QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
"ConvDwPad": "(QConvDw|ConvDepthWise2D)->Pad2D",
"ConvDwAct": "(QConvDw|ConvConvDepthWise2D2D)->ReLU",
"ConvDwActPad": "ConvDwAct->Pad2D",
# Max Pooling
"PadMaxPool": "MaxPooling2D<-Pad2D",
"MaxPoolAct": "MaxPooling2D->ReLU",
"PadMaxPoolAct": "PadMaxPool->ReLU",
# Average Pooling
"PadAvgPool": "AvgPooling2D<-Pad2D",
"AvgPoolAct": "AvgPooling2D->ReLU",
"PadAvgPoolAct": "PadAvgPool->ReLU",
# Global Average Pooling
"PadGlobalAvgPool": "GlobalAveragePooling2D<-Pad2D",
"GlobalAvgPoolAct": "GlobalAveragePooling2D->ReLU",
"PadGlobalAvgPoolAct": "PadGlobalAveragePool->ReLU",
# ElemWise
"QAdd": "Add->(Quantizer|QMul)",
"QSub": "Sub->(Quantizer|QMul)",
# "QMul": "Mul->Quantizer", # Already defined
"AddAct": "(QAdd|Add)->ReLU",
"SubAct": "(QSub|Sub)->ReLU",
"MulAct": "(QMul|Mul)->ReLU",
# Activation
"QReLU": "ReLU->(Quantizer|QMul)",
})
for node, recipe in cpp_recipes.items():
aidge_core.fuse_to_metaops(graph_view, recipe, node)
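In the recipe strings above, `A->B` matches a node A followed by B, `A<-B` matches A preceded by B, and `(A|B)` is an alternation; each match is collapsed into a MetaOperator named after the dictionary key. A minimal sketch, assuming `model` is an existing aidge_core.GraphView and using only methods already seen in this file:

    # Fuse a (hypothetical) GraphView and list the resulting MetaOperator types.
    cpp_fuse_to_metaops(model)
    for node in model.get_nodes():
        if node.type() != "Producer":
            # e.g. a Conv2D followed by a ReLU now shows up as a single "ConvAct" node
            print(node.name(), node.type())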
def set_nodes_names(scheduler):
"""
Set the CPP nodes names as well as their producers.
The producers naming is handled from their child node.
[TODO] Fc and Conv layers will always have weights as parent 1 and
possibly biases as parent 2. It may be better to previously label the
producers.
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
"""
node_ids = {} # Dict holding the node type along with a counter
node_it = 0 # Node Iterator
## MetaOps
for node in scheduler.get_sequential_static_scheduling():
node_type = node.type()
if node_type != "Producer":
if node.type() not in node_ids:
node_ids[node_type] = 0
# Set node name
node.set_name("_" + str(node_it) + "_" +
str(node_type) + "_" + str(node_ids[node_type]))
node_ids[node_type] += 1
node_it += 1
# Set producers names
## Weights & Biases producers
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_parent(1).set_name(node.name() + "_weights")
if node.get_parent(2) is not None:
node.get_parent(2).set_name(node.name() + "_biases")
## Scaling Producers
for node in scheduler.get_sequential_static_scheduling():
"""
TODO: If multiple quantizer nodes are found, the producers will
all have the same name and this will not work properly.
"""
if node.type() == "Producer":
child_node = node.output(0)[0][0]
if node.attributes().has_attr("shift_prod"):
node.set_name(child_node.name() + "_shift")
if node.attributes().has_attr("coef_prod"):
node.set_name(child_node.name() + "_coef")
def set_nodes_datatypes(graph_view: aidge_core.GraphView):
""" Set the nodes' datatypes
The set_datatype function can't be used on Conv2D and FC nodes directly
as the biases datatype is different from the other inputs.
TODO: Should be using forward_datatype()
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
"""
for node in graph_view.get_nodes():
if node.type() != "Producer":
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
if node.get_operator().get_input(0) is not None:
node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input
node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights
if node.get_parent(2) is not None:
node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases
node.get_operator().get_output(0).set_datatype(aidge_core.dtype.int8) # Output
else:
node.get_operator().set_datatype(aidge_core.dtype.int8)
# Set input node's datatype
for n in graph_view.get_input_nodes():
if n.get_operator().get_input(0) is not None:
n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
def exclude_unwanted_producers(model):
""" Exclude some producers not needed for the export
Currently excludes the producers attached to the Mul, BitShift and Clip nodes, as they are
tensors holding a single value. This value is retrieved during the export
generation process and passed directly as an argument within the layer
configuration.
"""
nodes_to_ignore = ["Mul", "BitShift", "Clip"]
for node in model.get_nodes():
if node.type() == "Producer":
children_nodes = [n.type() for n in node.get_children()]
for node_type in nodes_to_ignore:
if node_type in children_nodes:
node.attributes().ignore = True
break
def set_scaling_attributes(export_node: aidge_core.export_utils.ExportNode, node: aidge_core.Node):
"""
Look recursively for a Quantizer node inside of the given node,
then set shift and coef attributes of the given export node.
[TODO] Should be moved into aidge_core.ExportNode
:param export_node: An instance of :py:class:`aidge_core.export_utils.ExportNode` to set the scaling
attributes needed for a quantized export.
:type export_node: aidge_core.export_utils.ExportNode
:param node: Node which may hold a Quantizer node.
:type node: aidge_core.Node
"""
QNode = get_node_from_metaop(node, "Quantizer")
if QNode:
BNode = get_node_from_metaop(QNode[0], "BitShift")
export_node.attributes["shift_value"] = BNode[0].get_operator().get_input(1)[0]
QMulNode = get_node_from_metaop(node, "QMul")
if QMulNode:
CNode = get_node_from_metaop(QMulNode[0], "Mul")
export_node.attributes["coef_value"] = CNode[0].get_operator().get_input(1)[0]
def normalize(array):
"""
Normalize an input image between -1 and 1
"""
if array.max() == array.min():
return array/array.max()
array = (array - array.min()) / (array.max() - array.min())
return 2 * array - 1
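A tiny worked example of the mapping above (the minimum maps to -1, the maximum to +1):

    import numpy as np
    img = np.array([0.0, 127.5, 255.0])
    # (img - 0) / 255 -> [0, 0.5, 1]; then 2 * x - 1 -> [-1, 0, 1]
    print(normalize(img))   # [-1.  0.  1.]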
def generate_aidge_ifmaps(model):
json_nodes = []
for node in model.get_nodes():
if node.type() != "Producer":
output = node.get_operator().get_output(0)
data = {
"name": node.name(),
"dims": output.dims(),
"dtype": aidge2c(output.dtype()),
"dformat": str(output.dformat()),
"values": np.array(output).tolist()
}
json_nodes.append(data)
# Write the entire list to the JSON file after the loop
with open('aidge_output.json', 'w') as file:
json.dump(json_nodes, file, indent=2, separators=(",", ": "))
def export_aidge_ifmaps(export_folder_name):
os.makedirs(export_folder_name / "data" / "aidge_outputs")
os.makedirs(export_folder_name / "data" / "export_outputs")
# Load the JSON data from the file
with open('aidge_output.json', 'r') as file:
json_nodes = json.load(file)
# Access the data
for node in json_nodes:
name = node["name"]
dims = node["dims"]
dtype = node["dtype"]
dformat = node["dformat"]
values = node["values"]
generate_file(export_folder_name / "data" / "aidge_outputs" / (name + ".hpp"),
ROOT / "templates" / "data" / "aidge_tensor.jinja",
dtype=dtype,
dformat=dformat,
name=name + "_output_0_aidge",
dims=dims,
values=values)
# Remove the JSON file
os.remove('aidge_output.json')
def propagate(model, scheduler, tensor):
"""
Propagate the given tensor into the model and return the
output tensor.
"""
# Run the inference
scheduler.forward(True, [tensor])
# Gather the results
output_node = model.get_ordered_outputs()[0][0]
output_tensor = output_node.get_operator().get_output(0).clone()
output_tensor.set_backend("cpu")
return np.array(output_tensor)
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#include <type_traits>
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "kernels/rescaling.hpp"
#include "network/activation_utils.hpp"
#include "network/rescaling_utils.hpp"
#include <sys/types.h>
template<typename Output_T, typename T,
typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
return value;
}
template<typename Output_T, typename T,
typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
if (std::is_unsigned<Output_T>::value) {
return clamp(value, T(0), (T(1) << sat) - 1);
} else {
return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
}
}
template<typename Output_T,
typename Sum_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
int output,
ActivationFunction_T func,
const Rescaling_T& __restrict rescaling)
{
switch(func) {
case Linear:
case Saturation: {
break;
}
case Rectifier: {
if(weightedSum <= 0) weightedSum = 0;
break;
}
default:
// Unsupported activation function
break;
}
// Value fixed here for now but it should be generated by
// the export module or determined by the type of Output_T
// For now only works for int8_t and uint8_t
const uint32_t NB_BITS = 8;
return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
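For reference, a short Python mirror of the integer saturate() path above with the fixed NB_BITS = 8, showing the resulting clamping ranges (illustrative only; the C++ templates are the actual implementation):

    def saturate_ref(value, sat=8, unsigned=False):
        # Mirrors the non-floating-point saturate() overload above.
        if unsigned:
            lo, hi = 0, (1 << sat) - 1                        # [0, 255]
        else:
            lo, hi = -(1 << (sat - 1)), (1 << (sat - 1)) - 1  # [-128, 127]
        return min(max(value, lo), hi)

    assert saturate_ref(300) == 127
    assert saturate_ref(-300) == -128
    assert saturate_ref(300, unsigned=True) == 255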
template<int NB_DATA,
template<size_t NB_ELTS,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void activation_forward (
......@@ -66,12 +26,22 @@ void activation_forward (
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling)
{
for (int i = 0; i < NB_DATA; ++i)
{
outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling);
}
int inOffset = 0;
int outOffset = 0;
}
for (size_t i = 0; i < NB_ELTS; ++i) {
if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
outputs[outOffset + i] = activation_forward_value<Output_T>(inputs[inOffset + i], i, ACTIVATION, rescaling);
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
......@@ -2,16 +2,18 @@
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
template<int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
template<size_t NB_BATCHES, size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
typename Param_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
......@@ -20,18 +22,22 @@ void batchnorm_forward (
const Param_T* __restrict variances,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const double epsilon)
const double epsilon,
const Rescaling_T& __restrict rescaling)
{
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
const Output_T var = sqrt(variances[output] + epsilon);
for (size_t batch = 0; batch < NB_BATCHES; ++batch) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// If the variance is 0, we need to avoid division by 0
Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const size_t outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset] = activation_forward_value<Output_T>(sAs, output, ACTIVATION, rescaling);
}
}
}
}
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#include <sys/types.h>
template<size_t AXIS_SIZE_POST,
size_t AXIS_SIZE_PRE,
const size_t AXIS_SIZE[],
size_t TOTAL_AXIS_SIZE,
size_t NB_INPUTS,
typename T>
__attribute__((always_inline)) inline static
void concat_forward (
const T* const * __restrict inputs,
T* __restrict output)
{
for (size_t i = 0; i < AXIS_SIZE_PRE; ++i) {
// Loop over post-axis (e.g., dims after axis 1)
for (size_t j = 0; j < AXIS_SIZE_POST; ++j) {
size_t axis_offset = 0;
// Loop over each input tensor
for (size_t n = 0; n < NB_INPUTS; ++n) {
for (size_t k = 0; k < AXIS_SIZE[n]; ++k) {
const size_t input_idx = i * AXIS_SIZE[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
output[i * TOTAL_AXIS_SIZE * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
inputs[n][input_idx];
}
axis_offset += AXIS_SIZE[n]; // move along axis in output
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
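A NumPy mirror of the indexing used by concat_forward(), checked against np.concatenate (shapes below are illustrative; this is a reference model, not part of the export):

    import numpy as np

    def concat_ref(inputs, axis):
        dims = inputs[0].shape
        pre = int(np.prod(dims[:axis]))           # AXIS_SIZE_PRE
        post = int(np.prod(dims[axis + 1:]))      # AXIS_SIZE_POST
        sizes = [x.shape[axis] for x in inputs]   # AXIS_SIZE[]
        total = sum(sizes)                        # TOTAL_AXIS_SIZE
        out = np.empty((pre, total, post), dtype=inputs[0].dtype)
        offset = 0
        for n, x in enumerate(inputs):
            flat = x.reshape(pre, sizes[n], post)
            for k in range(sizes[n]):             # same loop structure as the kernel
                out[:, offset + k, :] = flat[:, k, :]
            offset += sizes[n]
        return out.reshape(dims[:axis] + (total,) + dims[axis + 1:])

    a, b = np.random.rand(2, 3, 4), np.random.rand(2, 5, 4)
    assert np.allclose(concat_ref([a, b], axis=1), np.concatenate([a, b], axis=1))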
......@@ -2,21 +2,33 @@
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
......@@ -28,60 +40,82 @@ void convolution_forward(
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
constexpr size_t OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
constexpr size_t OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const size_t syMin = (PADDING_Y == 0) ? 0
: max((PADDING_Y - (oy * STRIDE_Y) + DILATION_Y - 1) / DILATION_Y, 0);
const size_t syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
: clamp((CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y)) / DILATION_Y,
0, KERNEL_HEIGHT);
const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
#endif
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax = (PADDING_X == 0
const size_t sxMin = (PADDING_X == 0) ? 0
: max((PADDING_X - (ox * STRIDE_X) + DILATION_X - 1) / DILATION_X, 0);
const size_t sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
? KERNEL_WIDTH
: clamp((CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X)) / DILATION_X,
0, KERNEL_WIDTH);
const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
const int oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = NB_OUTPUTS * oPos;
const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
// <--
if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
Bias_T weightedSum = biases[output];
// <--
// Check if the biases are defined
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
&& sy >= syMax - syMin)
{
continue;
break;
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
int iOffset = NB_CHANNELS * iPos;
const size_t iPos = static_cast<size_t>(sxMin * DILATION_X + ix)
+ CHANNELS_WIDTH * (static_cast<size_t>(iy + (syMin + sy) * DILATION_Y));
int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
if (INPUT_MEM_WRAP_SIZE > 0
&& iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + KERNEL_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = NB_CHANNELS * (sxMin
+ KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
&& DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
......@@ -90,16 +124,24 @@ void convolution_forward(
weightedSum);
}
else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
&& sx >= sxMax - sxMin)
{
continue;
break;
}
int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
+ sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur
......@@ -116,4 +158,69 @@ void convolution_forward(
}
}
// Template overloading when biases are not given to the convolution
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION,
// Memory mapping: inputs
INPUT_MEM_CONT_OFFSET,
INPUT_MEM_CONT_SIZE,
INPUT_MEM_WRAP_OFFSET,
INPUT_MEM_WRAP_SIZE,
INPUT_MEM_STRIDE,
// Memory mapping: outputs
OUTPUT_MEM_CONT_OFFSET,
OUTPUT_MEM_CONT_SIZE,
OUTPUT_MEM_WRAP_OFFSET,
OUTPUT_MEM_WRAP_SIZE,
OUTPUT_MEM_STRIDE>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#include "network/typedefs.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
size_t PADDING_Y, size_t PADDING_X,
size_t STRIDE_Y, size_t STRIDE_X,
size_t DILATION_Y, size_t DILATION_X,
size_t KERNEL_HEIGHT, size_t KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
"NB_OUTPUTS should be a multiple of NB_CHANNELS.");
constexpr size_t DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr size_t DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr size_t OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr size_t OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (size_t oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const size_t syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const size_t syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = static_cast<int>(oy * STRIDE_Y) - static_cast<int>(PADDING_Y);
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (size_t ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (size_t output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const size_t sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const size_t sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = static_cast<int>(ox * STRIDE_X) - static_cast<int>(PADDING_X);
const size_t oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = (OUTPUT_MEM_STRIDE / sizeof(Output_T)) * oPos;
if (OUTPUT_MEM_WRAP_SIZE > 0 && oOffset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
oOffset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
// <--
const size_t channel = (output * NB_CHANNELS) / NB_OUTPUTS;
Bias_T weightedSum = biases ? biases[output] : 0;
for (size_t sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
{
continue;
}
const size_t iPos = static_cast<size_t>(ix)
+ CHANNELS_WIDTH * (static_cast<size_t>(iy + sy * DILATION_Y));
int iOffset = (INPUT_MEM_STRIDE / sizeof(Input_T)) * iPos;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (INPUT_MEM_WRAP_SIZE > 0
&& iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && KERNEL_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + KERNEL_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = (output*KERNEL_HEIGHT + sy)
* KERNEL_WIDTH;
if (!wrapInRange && NB_CHANNELS == (INPUT_MEM_STRIDE / sizeof(Input_T))
&& DILATION_X == 1 && ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
inputs + iOffset + channel,
weights + wOffset,
weightedSum);
}
else {
for (size_t sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
{
continue;
}
int iOffsetInRange = iOffset
+ sx * DILATION_X * (INPUT_MEM_STRIDE / sizeof(Input_T));
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
weightedSum += inputs[iOffsetInRange + channel]
* weights[wOffset + sx];
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
}
}
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
int INPUT_MEM_CONT_OFFSET,
int INPUT_MEM_CONT_SIZE,
int INPUT_MEM_WRAP_OFFSET,
int INPUT_MEM_WRAP_SIZE,
int INPUT_MEM_STRIDE,
// Memory mapping: outputs
int OUTPUT_MEM_CONT_OFFSET,
int OUTPUT_MEM_CONT_SIZE,
int OUTPUT_MEM_WRAP_OFFSET,
int OUTPUT_MEM_WRAP_SIZE,
int OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_depthwise_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION,
// Memory mapping: inputs
INPUT_MEM_CONT_OFFSET,
INPUT_MEM_CONT_SIZE,
INPUT_MEM_WRAP_OFFSET,
INPUT_MEM_WRAP_SIZE,
INPUT_MEM_STRIDE,
// Memory mapping: outputs
OUTPUT_MEM_CONT_OFFSET,
OUTPUT_MEM_CONT_SIZE,
OUTPUT_MEM_WRAP_OFFSET,
OUTPUT_MEM_WRAP_SIZE,
OUTPUT_MEM_STRIDE>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
......@@ -2,170 +2,74 @@
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
// Generic function for two inputs
template<int NB_ELTS,
ElemWise_T ELEM_OP,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Rescaling_T>
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<size_t NB_MAT, ElemWise_T ELEM_OP,
size_t INPUT1_CONT_SIZE, size_t INPUT2_CONT_SIZE, size_t OUTPUT_CONT_SIZE,
const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT1_MEM_CONT_OFFSET,
size_t INPUT1_MEM_CONT_SIZE,
size_t INPUT1_MEM_WRAP_OFFSET,
size_t INPUT1_MEM_WRAP_SIZE,
size_t INPUT1_MEM_STRIDE,
size_t INPUT2_MEM_CONT_OFFSET,
size_t INPUT2_MEM_CONT_SIZE,
size_t INPUT2_MEM_WRAP_OFFSET,
size_t INPUT2_MEM_WRAP_SIZE,
size_t INPUT2_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void elemwise_forward (
void elemwise_forward(
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
if (std::is_floating_point<Input_T>::value)
{
Input_T val = 0;
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] + inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Sub: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] - inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Mul: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] * inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
}
}
else
{
int32_t val = 0;
static_assert(INPUT1_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
static_assert(INPUT2_MEM_WRAP_SIZE == 0, "Incompatible input memory wrapping");
static_assert(OUTPUT_MEM_CONT_SIZE % OUTPUT_CONT_SIZE == 0, "Incompatible output memory wrapping");
auto apply_op = [](auto a, auto b) -> Output_T {
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] + inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Sub: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] - inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Mul: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] * inputs2[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
case Add: return a + b;
case Sub: return a - b;
case Mul: return a * b;
case Div: return a / b;
default: return a;
}
}
}
};
// Generic function for multiple inputs
// Not working
// template<ElemWise_T ELEM_OP, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int /*pos*/, int /*ch*/)
// {
// return 0;
// }
// template<ElemWise_T ELEM_OP,
// int NB_CHANNELS,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int pos, int ch,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// int iOffset = NB_CHANNELS * pos;
// return firstInputs[iOffset + ch]
// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
// }
// template<// For all inputs
// int NB_CHANNELS,
// int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
// int NB_ELTS,
// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
// ElemWise_T ELEM_OP,
// ActivationFunction_T ACTIVATION,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T,
// typename Rescaling_T>
// __attribute__((always_inline)) inline
// void elemWise_forward (
// Output_T* __restrict outputs,
// const Rescaling_T& __restrict rescaling,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
// const int pos = (ox + OUTPUTS_WIDTH * oy);
// int oOffset = NB_ELTS * pos;
// for (int ch = 0; ch < NB_ELTS; ++ch) {
// const Add_T val = elemWise<ELEM_OP,
// INPUT_NB_CHANNELS,
// INPUT_MEM_CONT_OFFSET,
// INPUT_MEM_CONT_NB_ELTS,
// INPUT_MEM_WRAP_OFFSET,
// INPUT_MEM_WRAP_NB_ELTS,
// INPUT_MEM_STRIDE,
// ARGS...>(pos, ch, firstInputs, inputs...);
// outputs[oOffset + ch]
// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
// }
// }
// }
// }
for (size_t stack = 0; stack < NB_MAT; ++stack) {
const size_t offset_in1 = OFFSET_IN1[stack] * INPUT1_CONT_SIZE;
const size_t offset_in2 = OFFSET_IN2[stack] * INPUT2_CONT_SIZE;
int out_offset = stack * OUTPUT_CONT_SIZE;
if (OUTPUT_MEM_WRAP_SIZE > 0 && out_offset >= static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
out_offset += (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
for (size_t i = 0; i < OUTPUT_CONT_SIZE; ++i) {
const size_t in0_id = (INPUT1_CONT_SIZE != 1) ? i : 0;
const size_t in1_id = (INPUT2_CONT_SIZE != 1) ? i : 0;
const size_t out_id = out_offset + i;
const auto val1 = inputs1[in0_id + offset_in1];
const auto val2 = inputs2[in1_id + offset_in2];
const Output_T val = apply_op(val1, val2);
outputs[out_id] = activation_forward_value<Output_T>(val, out_id, ACTIVATION, rescaling);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
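Conceptually, the OFFSET_IN1/OFFSET_IN2 tables generated by the export appear to replay NumPy-style broadcasting over the leading dimensions; ignoring the activation and rescaling, the kernel's Add/Sub/Mul/Div cases are roughly analogous to the following sketch (assumption, not taken from this diff):

    import numpy as np

    ops = {"Add": np.add, "Sub": np.subtract, "Mul": np.multiply, "Div": np.divide}

    a = np.random.rand(2, 1, 4)      # shapes chosen to exercise broadcasting
    b = np.random.rand(2, 3, 1)
    out = ops["Add"](a, b)           # the kernel then applies the activation per element
    assert out.shape == (2, 3, 4)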
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERP__
#define __AIDGE_EXPORT_CPP_KERNELS_ERP__
#include "network/typedefs.hpp"
#include "math.h"
#include <sys/types.h>
template<size_t NB_ELTS,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void erf_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
constexpr double a1 = 0.254829592;
constexpr double a2 = -0.284496736;
constexpr double a3 = 1.421413741;
constexpr double a4 = -1.453152027;
constexpr double a5 = 1.061405429;
constexpr double p = 0.3275911;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t i = 0; i < NB_ELTS; ++i) {
int sign = 1;
if (inputs[i] < 0)
sign = -1;
const double abs_value = abs(inputs[i]);
// A&S formula 7.1.26
const double t = 1.0/(1.0 + p*abs_value);
const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
outputs[i] = sign*y;
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ERP_
\ No newline at end of file
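The constants above are the coefficients of the Abramowitz & Stegun formula 7.1.26 approximation, whose absolute error is below about 1.5e-7. A Python mirror checked against math.erf:

    import math

    A1, A2, A3, A4, A5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    P = 0.3275911

    def erf_ref(x):
        # Same polynomial evaluation order as the kernel above.
        sign = -1.0 if x < 0 else 1.0
        ax = abs(x)
        t = 1.0 / (1.0 + P * ax)
        y = 1.0 - (((((A5 * t + A4) * t) + A3) * t + A2) * t + A1) * t * math.exp(-ax * ax)
        return sign * y

    assert all(abs(erf_ref(x) - math.erf(x)) < 2e-7
               for x in (-3.0, -0.5, 0.0, 0.7, 2.5))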
......@@ -2,16 +2,33 @@
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
/**
* @brief Kernel to use when the input is in the NHWC format, and the
* weights have been transposed accordingly.
*/
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
......@@ -23,50 +40,209 @@ void fullyconnected_forward (
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
// Warning, there is a trick here !
// To use this kernel, the inputs have to be in NHWC and the weights are in NCHW
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
// constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
constexpr size_t WEIGHT_WIDTH_STRIDE = NB_CHANNELS;
constexpr size_t WEIGHT_HEIGHT_STRIDE = NB_CHANNELS*CHANNELS_WIDTH;
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t och = 0; och < NB_OUTPUTS; ++och) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
int iOffset = INPUT_HEIGHT_STRIDE * iy;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + CHANNELS_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
const size_t wOffset = WEIGHT_HEIGHT_STRIDE * iy + WEIGHT_OUT_CHANNELS_STRIDE * och;
if (!wrapInRange && INPUT_WIDTH_STRIDE == WEIGHT_WIDTH_STRIDE) {
macsOnRange<INPUT_HEIGHT_STRIDE>(
inputs + iOffset,
weights + wOffset,
weightedSum);
}
else {
for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
macsOnRange<INPUT_WIDTH_STRIDE>(
inputs + iOffsetInRange,
weights + wOffset + ix * WEIGHT_WIDTH_STRIDE,
weightedSum);
}
}
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
/*
Here the kernel to use with inputs in NHWC and weights in NHWC
}
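// ---------------------------------------------------------------------------
// Illustrative reference (not part of the export): what the NHWC kernel above
// computes once the memory-wrapping machinery is stripped away and the input
// stride equals NB_CHANNELS. Inputs are laid out as [H][W][C] and the weights
// as [NB_OUT][H][W][C] (i.e. transposed to match the input layout), so the
// innermost loop runs over a contiguous range of channels. Names are
// hypothetical and activation/rescaling is omitted.
// ---------------------------------------------------------------------------
template<size_t C, size_t H, size_t W, size_t NB_OUT>
inline void fc_nhwc_reference_example(const float* inputs,   // [H][W][C]
                                      const float* weights,  // [NB_OUT][H][W][C]
                                      const float* biases,   // [NB_OUT] or nullptr
                                      float* outputs)        // [NB_OUT]
{
    for (size_t och = 0; och < NB_OUT; ++och) {
        float acc = (biases) ? biases[och] : 0.0f;

        for (size_t iy = 0; iy < H; ++iy)
            for (size_t ix = 0; ix < W; ++ix)
                for (size_t ch = 0; ch < C; ++ch)
                    acc += inputs[(iy * W + ix) * C + ch]
                         * weights[((och * H + iy) * W + ix) * C + ch];

        outputs[och] = acc;  // the real kernel applies ACTIVATION and rescaling here
    }
}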
/**
 * @brief Kernel to use when the input is in the NCHW or Default
 * format (4D or 2D).
 */
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void fullyconnected_default_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_WIDTH*CHANNELS_HEIGHT;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och;
macsOnRange<WEIGHT_OUT_CHANNELS_STRIDE>(
inputs,
weights + wOffset,
weightedSum);
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
}
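// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the export): the default kernel above is a
// single dot product per output channel, delegated to macsOnRange from
// "network/macs.hpp". Assuming that helper simply accumulates N products (the
// real implementation may be unrolled or vectorised), a minimal scalar
// equivalent would be:
// ---------------------------------------------------------------------------
template<size_t N, typename Input_T, typename Weight_T, typename Sum_T>
inline void macs_on_range_example(const Input_T*  __restrict inputs,
                                  const Weight_T* __restrict weights,
                                  Sum_T& weightedSum)
{
    for (size_t i = 0; i < N; ++i) {
        weightedSum += inputs[i] * weights[i];
    }
}
// With such a helper, fullyconnected_default_forward reduces, per output
// channel, to outputs[och] = act(bias[och] + dot(inputs, weights + och*C*H*W)).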
/**
* @brief Kernel to use when the input is in the NHWC format, but the
* weights have not been transposed and still follow the NCHW format order.
*/
template<size_t NB_CHANNELS,
size_t CHANNELS_HEIGHT, size_t CHANNELS_WIDTH,
size_t NB_OUTPUTS,
size_t OUTPUTS_HEIGHT, size_t OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void fullyconnected_transpose_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr size_t INPUT_WIDTH_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T));
constexpr size_t INPUT_HEIGHT_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH;
// constexpr size_t INPUT_OUT_CHANNELS_STRIDE = (INPUT_MEM_STRIDE / sizeof(Input_T))*CHANNELS_WIDTH*CHANNELS_HEIGHT;
constexpr size_t WEIGHT_HEIGHT_STRIDE = CHANNELS_WIDTH;
constexpr size_t WEIGHT_IN_CHANNELS_STRIDE = CHANNELS_HEIGHT*CHANNELS_WIDTH;
constexpr size_t WEIGHT_OUT_CHANNELS_STRIDE = NB_CHANNELS*CHANNELS_HEIGHT*CHANNELS_WIDTH;
#ifdef _OPENMP
#pragma omp parallel for
for (int och = 0; och < NB_OUTPUTS; och++) {
#endif
for (size_t och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
Bias_T weightedSum = biases[och];
for (size_t iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
int iOffset = INPUT_HEIGHT_STRIDE * iy;
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy);
int iOffset = NB_CHANNELS * iPos;
// Wrapping cannot occur in the middle of a line, except if
// there is only one line (1D)!
bool wrapInRange = false;
const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
* (iy + CHANNELS_HEIGHT * och);
if (INPUT_MEM_WRAP_SIZE > 0 && iOffset >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
iOffset += (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
else if (INPUT_MEM_WRAP_SIZE > 0 && CHANNELS_WIDTH > 1
&& CHANNELS_HEIGHT == 1 // single line (1D)!
&& iOffset + CHANNELS_WIDTH * NB_CHANNELS
> (INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
wrapInRange = true;
}
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset,
weights + wOffset,
weightedSum);
const size_t wOffset = WEIGHT_OUT_CHANNELS_STRIDE * och + WEIGHT_HEIGHT_STRIDE * iy;
for (size_t ix = 0; ix < CHANNELS_WIDTH; ++ix) {
int iOffsetInRange = iOffset + ix * INPUT_WIDTH_STRIDE;
if (wrapInRange
&& iOffsetInRange >= static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T)))
{
iOffsetInRange += (INPUT_MEM_WRAP_OFFSET
- INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
// Beware that the pointer increment for weights is
// CHANNELS_HEIGHT*CHANNELS_WIDTH
macsOnRange<NB_CHANNELS, WEIGHT_IN_CHANNELS_STRIDE>(
inputs + iOffsetInRange,
weights + wOffset + ix,
weightedSum);
}
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
*/
}
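// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the export): the transpose kernel above
// pairs a contiguous NHWC channel run in the input with NCHW-ordered weights,
// so the weight pointer must advance by CHANNELS_HEIGHT*CHANNELS_WIDTH between
// consecutive channels. Assuming the two-parameter macsOnRange overload in
// "network/macs.hpp" performs exactly that strided accumulation, a scalar
// equivalent would be:
// ---------------------------------------------------------------------------
template<size_t N, size_t WEIGHT_STRIDE,
         typename Input_T, typename Weight_T, typename Sum_T>
inline void strided_macs_example(const Input_T*  __restrict inputs,   // contiguous channels (NHWC)
                                 const Weight_T* __restrict weights,  // channel-major (NCHW)
                                 Sum_T& weightedSum)
{
    for (size_t ch = 0; ch < N; ++ch) {
        weightedSum += inputs[ch] * weights[ch * WEIGHT_STRIDE];
    }
}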
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#define __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include <sys/types.h>
template <size_t AXIS_DIM_SIZE,
size_t PREAXIS_STRIDE,
size_t AXIS_STRIDE,
size_t POSTAXIS_STRIDE,
size_t NB_ELTS,
typename Input_T,
typename Output_T>
// void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
// std::vector<DimSize_t>& dims, const void* input_, void* output_)
__attribute__((always_inline)) inline void
hardmax2d_forward(const Input_T *__restrict input,
Output_T *__restrict output) {
// fill output with 0
for (Output_T *i = output; i != output + NB_ELTS; ++i) {
*i = 0;
}
// For each index on all the axes before and after 'axis', we have a
// different max element to find
for (size_t i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
for (size_t j = 0; j < POSTAXIS_STRIDE; ++j) {
// Init the max with first element
size_t maxIdx = 0;
Input_T maxVal = input[preAxisOffset + j];
            // Loop over the elements along 'axis'.
            // The max was initialised with element 0, so the loop starts
            // at k = 1, as if the first iteration had already been done.
for (size_t k = 1,
postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
k < AXIS_DIM_SIZE;
++k, postAxisOffset += POSTAXIS_STRIDE) {
Input_T currVal = input[postAxisOffset + j];
// Update max elements
if (currVal > maxVal) {
maxIdx = k;
maxVal = currVal;
}
}
output[preAxisOffset + maxIdx * POSTAXIS_STRIDE + j] = 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
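// ---------------------------------------------------------------------------
// Illustrative usage (not part of the export): for a row-major input of shape
// (2, 3, 4) with the hardmax taken along axis 1, the template parameters of
// the kernel above decompose as
//   PREAXIS_STRIDE  = 2   (product of the dims before 'axis')
//   AXIS_DIM_SIZE   = 3   (size of 'axis')
//   POSTAXIS_STRIDE = 4   (product of the dims after 'axis')
//   NB_ELTS         = 24  (total number of elements)
// AXIS_STRIDE is assumed to be the row-major stride of 'axis' (here 4); it is
// not used by the loop shown above. The function below is only an example.
// ---------------------------------------------------------------------------
inline void hardmax_usage_example() {
    static float in[24]  = {};  // shape (2, 3, 4); fill with real data in practice
    static float out[24] = {};  // gets a single 1 per (pre-axis, post-axis) pair

    // <AXIS_DIM_SIZE, PREAXIS_STRIDE, AXIS_STRIDE, POSTAXIS_STRIDE, NB_ELTS>
    hardmax2d_forward<3, 2, 4, 4, 24>(in, out);
}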
#ifndef __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
#define __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
#include "network/typedefs.hpp"
#include <sys/types.h>
// Generic identity (copy) function
template<size_t M,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void identity_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
    // If the input and output pointers are the same, the memory manager has
    // already optimized this function away, so it is a no-op!
    if (inputs == outputs)
        return;
    // An identity is essentially a no-op in C++:
    // we only need to copy the input buffer to the output.
for (size_t m = 0; m < M; ++m) {
outputs[m] = inputs[m];
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_IDENTITY__
\ No newline at end of file
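// ---------------------------------------------------------------------------
// Illustrative usage (not part of the export): when the memory planner maps an
// Identity node onto the same buffer for input and output, the early return
// above makes the call free; with distinct buffers it degenerates to a plain
// element-wise copy. The names below are hypothetical.
// ---------------------------------------------------------------------------
inline void identity_usage_example() {
    static float buf[8]   = {0, 1, 2, 3, 4, 5, 6, 7};
    static float other[8] = {};

    identity_forward<8>(buf, buf);    // in-place: returns immediately, nothing copied
    identity_forward<8>(buf, other);  // distinct buffers: copies the 8 elements
}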
......@@ -2,24 +2,50 @@
#define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
#include "network/typedefs.hpp"
#include <sys/types.h>
#include <sys/types.h>
template<int NB_DATA,
template<size_t NB_ELTS,
// Memory mapping: inputs
size_t INPUT_MEM_CONT_OFFSET,
size_t INPUT_MEM_CONT_SIZE,
size_t INPUT_MEM_WRAP_OFFSET,
size_t INPUT_MEM_WRAP_SIZE,
size_t INPUT_MEM_STRIDE,
// Memory mapping: outputs
size_t OUTPUT_MEM_CONT_OFFSET,
size_t OUTPUT_MEM_CONT_SIZE,
size_t OUTPUT_MEM_WRAP_OFFSET,
size_t OUTPUT_MEM_WRAP_SIZE,
size_t OUTPUT_MEM_STRIDE,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void leakyrelu_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const float negative_slope)
{
#pragma omp parallel for
for (int i = 0; i < NB_DATA; ++i) {
if (inputs[i] >= 0) {
outputs[i] = inputs[i];
} else {
outputs[i] = negative_slope * inputs[i];
int inOffset = 0;
int outOffset = 0;
for (size_t i = 0; i < NB_ELTS; ++i) {
if (INPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(INPUT_MEM_CONT_SIZE / sizeof(Input_T))) {
inOffset = (INPUT_MEM_WRAP_OFFSET - INPUT_MEM_CONT_OFFSET
- INPUT_MEM_CONT_SIZE) / sizeof(Input_T);
}
if (OUTPUT_MEM_WRAP_SIZE > 0 && i == static_cast<int>(OUTPUT_MEM_CONT_SIZE / sizeof(Output_T))) {
outOffset = (OUTPUT_MEM_WRAP_OFFSET - OUTPUT_MEM_CONT_OFFSET
- OUTPUT_MEM_CONT_SIZE) / sizeof(Output_T);
}
if (inputs[inOffset + i] >= 0) {
outputs[outOffset + i] = inputs[inOffset + i];
}
else {
outputs[outOffset + i] = negative_slope * inputs[inOffset + i];
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
\ No newline at end of file
#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
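// ---------------------------------------------------------------------------
// Illustrative walkthrough (not part of the export): the *_MEM_* template
// parameters describe a buffer that the memory planner may have split in two:
// a contiguous part of CONT_SIZE bytes at CONT_OFFSET followed, elsewhere, by
// a wrapped part of WRAP_SIZE bytes at WRAP_OFFSET. The sketch below applies
// the same offset formula as leakyrelu_forward (which computes it once, when
// the index crosses the boundary, and keeps it for the rest of the loop) for
// a hypothetical float buffer with CONT_OFFSET = 0, CONT_SIZE = 64 bytes
// (16 floats) and WRAP_OFFSET = 256 bytes.
// ---------------------------------------------------------------------------
inline int wrapped_index_example(size_t i) {
    constexpr size_t CONT_OFFSET = 0;    // bytes
    constexpr size_t CONT_SIZE   = 64;   // bytes -> 16 floats fit contiguously
    constexpr size_t WRAP_OFFSET = 256;  // bytes

    int offset = 0;
    if (i >= CONT_SIZE / sizeof(float)) {
        // Jump from the end of the contiguous block to the start of the
        // wrapped block, expressed in elements.
        offset = static_cast<int>((WRAP_OFFSET - CONT_OFFSET - CONT_SIZE) / sizeof(float));
    }
    // Element 16 lands at index 64, i.e. byte offset 256 == WRAP_OFFSET.
    return offset + static_cast<int>(i);
}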
......@@ -2,16 +2,15 @@
#define __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "network/activation_utils.hpp"
#include <sys/types.h>
// Generic function for matmul and activation
template<int M,
int K,
int N,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Rescaling_T>
template<size_t NB_MAT, size_t N, size_t M, size_t K,
const size_t OFFSET_IN1[], const size_t OFFSET_IN2[],
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void matmul_forward (
const Input_T* __restrict inputs1,
......@@ -19,15 +18,28 @@ void matmul_forward (
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling)
{
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
Output_T sum = Output_T(0);
for (int k = 0; k < K; ++k) {
sum += inputs1[K*m + k] * inputs2[N*k + n];
for (size_t stack = 0; stack < NB_MAT; ++stack) {
const size_t offset1 = OFFSET_IN1[stack] * N * K;
const size_t offset2 = OFFSET_IN2[stack] * K * M;
Output_T* out_ptr = &outputs[stack * N * M];
for (size_t i = 0; i < N; ++i) {
            const Input_T* in1_row = &inputs1[offset1 + i * K];
            for (size_t j = 0; j < M; ++j) {
                Output_T sum = 0;
                // Walk down column j of inputs2, which is stored row-major
                for (size_t l = 0; l < K; ++l) {
                    sum += in1_row[l] * inputs2[offset2 + l * M + j];
}
out_ptr[i * M + j] = activation_forward_value<Output_T>(
sum, 0 /* not applicable */, ACTIVATION, rescaling
);
}
outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
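// ---------------------------------------------------------------------------
// Illustrative reference (not part of the export): the OFFSET_IN1 / OFFSET_IN2
// arrays of the kernel above encode batch broadcasting, e.g. A of shape
// (1, N, K) multiplied with B of shape (3, K, M) uses offA = {0, 0, 0} and
// offB = {0, 1, 2}, so the single A matrix is reused for every stack. The
// sketch below reproduces that indexing without the activation/rescaling step;
// all names are hypothetical.
// ---------------------------------------------------------------------------
template<size_t NB_MAT, size_t N, size_t M, size_t K>
inline void batched_matmul_reference_example(const float* A, const float* B, float* C,
                                             const size_t (&offA)[NB_MAT],
                                             const size_t (&offB)[NB_MAT])
{
    for (size_t s = 0; s < NB_MAT; ++s)
        for (size_t i = 0; i < N; ++i)
            for (size_t j = 0; j < M; ++j) {
                float sum = 0.0f;
                for (size_t l = 0; l < K; ++l)
                    sum += A[offA[s] * N * K + i * K + l]
                         * B[offB[s] * K * M + l * M + j];
                C[s * N * M + i * M + j] = sum;
            }
}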
\ No newline at end of file