Compare revisions
Commits on Source (197)
Showing with 934 additions and 225 deletions
@@ -13,6 +13,11 @@ __pycache__
dist*/
aidge_export_cpp/_version.py
wheelhouse/*
env_aidge/
# Temp test folders
aidge_export_cpp/unit_tests/*_temp_test
*_test/
# Mermaid
*.mmd
@@ -22,3 +27,6 @@ xml*/
# ONNX
*.onnx
# GDB
.gdb_history
\ No newline at end of file
@@ -12,12 +12,27 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/pip.gitlab-ci.yml'
# Since aidge_export_cpp is a pure python package building on windows and on ubuntu doesn't differ
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
before_script:
- !reference [.setup:test:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
# Need to install extra dependencies for tests:
- python -m pip install torch torchvision
coverage:ubuntu_python:
before_script:
- !reference [.setup:coverage:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
\ No newline at end of file
# Version 0.2.1 (January 31, 2025)
# Version 0.2.0 (December 6, 2024)
# Version 0.0.1 (January 23, 2024)
Initial release
# Aidge CPP Export
Use this module to export your Aidge model to a generic CPP export
\ No newline at end of file
## Install
Install with:
pip install -v .
## Development mode install
For editable/development mode, install with:
pip install -v --no-build-isolation -e .
@@ -2,12 +2,15 @@ r"""
Aidge Export for CPP standalone projects
"""
from pathlib import Path
from .operators import * # Constants
from collections import defaultdict FILE = Path(__file__).resolve()
import aidge_core ROOT = FILE.parents[0]
from aidge_export_cpp.utils import ROOT
from .export_registry import ExportLibCpp
from .export_utils import *
from .operators import *
from .export import *
from . import benchmark
import contextlib
import os
from shutil import rmtree
from subprocess import run
import numpy as np
import aidge_core
import aidge_backend_cpu
import aidge_export_cpp
def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# load and set up the model
# model.set_datatype(ai.dtype.float32)
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
# for ordered_input in ordered_inputs:
# ordered_input.set_backend("cpu")
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" ├─Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
print(" ok")
print(" ├─Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
return timings
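For context, here is a minimal usage sketch of the timing helper above, assuming it lives in the new `benchmark` module (as the `from . import benchmark` line suggests) and that a model is loaded with `aidge_onnx`; the file name and input shape are illustrative only:

```python
import numpy as np
import aidge_onnx
from aidge_export_cpp import benchmark

# Hypothetical model and single named input (shape chosen purely for illustration)
model = aidge_onnx.load_onnx("model.onnx")
input_data = [("input_0", np.random.rand(1, 3, 32, 32).astype(np.float32))]

# Generates, builds and runs the export, returning one timing per iteration
timings = benchmark.measure_inference_time(model, input_data, nb_warmup=5, nb_iterations=20)
print("mean:", np.mean(timings), "std:", np.std(timings))
```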
def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
# load and set up the model
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" │ Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
print(" ok")
print(" │ Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
outputs_str: list[str] = output_str.stdout.strip().split('\n')
outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
for i, pair in enumerate(model.get_ordered_outputs()):
dims = pair[0].get_operator().get_output(pair[1]).dims()
nb_dims = len(dims)
dims_permutted = dims
if nb_dims == 3:
dims_permutted = [dims[0], dims[2], dims[1]]
if nb_dims == 4:
dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
if np.prod(dims) != outputs[i].size:
aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
outputs[i] = outputs[i].reshape(dims_permutted)
if nb_dims == 3:
outputs[i] = outputs[i].transpose(0,2,1)
if nb_dims == 4:
outputs[i] = outputs[i].transpose(0,3,1,2)
return outputs
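Similarly, a hedged sketch of `compute_output`, reusing `model` and `input_data` from the previous example; the printed statistics are illustrative only:

```python
# Run the generated export and read its outputs back as numpy arrays (original layout restored)
export_outputs = benchmark.compute_output(model, input_data)
for idx, out in enumerate(export_outputs):
    print(f"output {idx}: shape={out.shape}, mean={out.mean():.6f}")
```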
import re
import os import os
from pathlib import Path
import shutil import shutil
import numpy as np import numpy as np
from pathlib import Path
from typing import List, Union from typing import List, Union
from jinja2 import Environment, FileSystemLoader
import aidge_core import aidge_core
from aidge_core.export_utils.code_generation import * from aidge_core.mem_info import generate_optimized_memory_info
from aidge_export_cpp.utils import (ROOT, OPERATORS_REGISTRY, supported_operators) from aidge_core.export_utils import scheduler_export, generate_main_cpp, aidge2c, generate_file
from aidge_export_cpp.utils.converter import aidge_datatype2ctype, numpy_dtype2ctype
import aidge_export_cpp.operators from aidge_export_cpp import ExportLibCpp, ROOT
from aidge_export_cpp.utils.generation import * from aidge_export_cpp.export_utils import read_log_file
from aidge_export_cpp.memory import *
def export(export_folder_name: str,
def generate_input_file(export_folder:str, graphview: aidge_core.GraphView,
array_name:str, scheduler: Union[List[aidge_core.Node],
array: np.ndarray): aidge_core.Scheduler],
inputs_tensor: aidge_core.Tensor = None,
# If directory doesn't exist, create it labels: aidge_core.Tensor = None,
if not os.path.exists(export_folder): dev_mode: bool = False,
os.makedirs(export_folder) aidge_cmp: bool = False):
generate_file( """ Export an aidge_core.Scheduler to C++ code
file_path=f"{export_folder}/{array_name}.h",
template_path=str(ROOT / "templates" / "data" / "inputs.jinja"), :param export_folder_name: Export folder name
dims = array.shape, :type export_folder_name: str
data_t = numpy_dtype2ctype(array.dtype), :param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
name = array_name, ordered input/output data within the computational graph.
values = array.tolist() :type graph_view: aidge_core.GraphView
) :param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
:param inputs_tensor: **For future** argument to provide tensor to use in the main function, not implemented yet!
def export(export_folder_name, graphview, scheduler): By default, the input of the given graph will be exported.
:type input_tensor: aidge_core.Tensor
export_folder = Path().absolute() / export_folder_name to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
os.makedirs(str(export_folder), exist_ok=True) :param labels: Argument to provide labels tensor to generate and use in the main function.
:type labels: aidge_core.Tensor
dnn_folder = export_folder / "dnn" :param dev_mode: Whether or not the developer mode is enabled. If enabled, the export files
os.makedirs(str(dnn_folder), exist_ok=True) will be symlinks from the aidge_export_cpp module. Therefore, modifying
a file within the export will change the module as well.
list_actions = [] :type dev_mode: boolean
list_configs = [] """
list_forward_nodes = scheduler.get_static_scheduling() export_folder_name = Path(export_folder_name)
for node in list_forward_nodes: # Remove existing export
if node.type() in supported_operators(): if os.path.isdir(export_folder_name):
op = OPERATORS_REGISTRY[node.type()](node) print("Removing existing export directory...")
shutil.rmtree(export_folder_name)
# For configuration files
list_configs = op.export(dnn_folder, list_configs) # Generate Model Files
"""
# For forward file Perform the following tasks :
list_actions = op.forward(list_actions) - Generate the parameters and layers config files
- Generate the forward.cpp file
- Copy all needed kernels
# Memory management """
mem_size, mem_info = compute_default_mem_info(scheduler)
scheduler_export(scheduler,
# Generate the memory file export_folder_name,
generate_file( ExportLibCpp,
str(dnn_folder / "memory" / "mem_info.h"), memory_manager=generate_optimized_memory_info,
str(ROOT / "templates" / "memory" / "mem_info.jinja"), memory_manager_args={
mem_size = mem_size, "stats_folder": f"{export_folder_name}/stats"},
mem_info_legends = MEMORY_INFO_TEMPLATE, dev_mode=dev_mode)
mem_info = mem_info
) # Generate main file
list_configs.append("memory/mem_info.h") generate_main_cpp(export_folder_name, graphview, labels=labels, inputs_tensor=inputs_tensor)
# Get entry nodes # Generate log files (aidge_cmp option)
# It supposes the entry nodes are producers with constant=false """
# Store the datatype & name If the aidge_cmp option has been enabled, the generated log_outputs will
list_inputs_name = [] be copied into the generated export in order to be used as reference.
for node in graphview.get_nodes(): """
if node.type() == "Producer": if aidge_cmp:
if not node.get_operator().attr.constant: ranked_nodes = graphview.get_ranked_nodes_name("{0}[{1}#{3}]")
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype()) os.makedirs(export_folder_name / "data" / "aidge_outputs")
list_inputs_name.append((export_type, node.name())) os.makedirs(export_folder_name / "data" / "export_outputs")
for node in graphview.get_nodes():
# Get output nodes if node.type() != "Producer":
# Store the datatype & name, like entry nodes file_path = 'log_outputs/' + ranked_nodes[node] + '/output_0.log'
list_outputs_name = [] data_t = aidge2c(node.get_operator().get_output(0).dtype())
for node in graphview.get_nodes(): name = node.name() + '_output_0_aidge'
if len(node.get_children()) == 0: dims = node.get_operator().get_output(0).dims()
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype()) values = read_log_file(file_path)
list_outputs_name.append((export_type, node.name()))
generate_file(export_folder_name / "data" / "aidge_outputs" / (node.name() + ".hpp"),
# Generate forward file ROOT / "templates" / "data" / "aidge_tensor.jinja",
# TODO: for now the mem type is bound for all intermediate results, should change. data_t=data_t,
# Note that we may have all inputs constants, hence select output type name=name,
assert len(list_outputs_name) >= 1, f"TODO: requires some output to determine mem type" dims=dims,
mem_ctype = list_outputs_name[0][0] values=values)
generate_file(
str(dnn_folder / "src" / "forward.cpp"),
str(ROOT / "templates" / "network" / "network_forward.jinja"),
headers=list_configs,
actions=list_actions,
inputs= list_inputs_name,
outputs=list_outputs_name,
mem_ctype=mem_ctype,
)
# Generate dnn API
generate_file(
str(dnn_folder / "include" / "dnn.hpp"),
str(ROOT / "templates" / "network" / "dnn_header.jinja"),
libraries=[],
functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
)
# Copy all static files in the export
shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
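For reference, a minimal sketch of driving the new `export()` entry point, assuming it is re-exported at package level (as the new `__init__.py` suggests) and that the model has already been prepared upstream (fused, datatyped); the folder name is illustrative:

```python
import aidge_core
from aidge_export_cpp import export

# 'model' is an aidge_core.GraphView prepared beforehand
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()

# Generates the standalone C++ project (Makefile, kernels, forward.cpp, main.cpp)
export("my_export_cpp", model, scheduler, labels=None, dev_mode=False, aidge_cmp=False)
```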
from aidge_core.export_utils import ExportLib
from aidge_export_cpp import ROOT
class ExportLibCpp(ExportLib):
_name="export_cpp"
static_files={
str(ROOT / "static" / "Makefile"): "",
str(ROOT / "static" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "rescaling_utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "activation_utils.hpp"): "dnn/include/network",
}
import os
from collections import OrderedDict
import aidge_core
from aidge_core.export_utils import get_node_from_metaop
def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
"""
Fuse nodes into metaops adapted for the CPP Export
TODO: These recipes should be in aidge_core
:param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
ordered input/output data within the computational graph.
"""
cpp_recipes = OrderedDict({
# Quantization
"QMul": "Mul->Quantizer", # Fixed Point Scaling
# FC
"QFC": "FC->(Quantizer|QMul)",
"FCAct": "(FC|QFC)->ReLU",
# Conv
"QConv": "Conv2D->(Quantizer|QMul)",
"PadConv": "(QConv|Conv2D)<-Pad2D",
"ConvAct": "(QConv|Conv2D)->ReLU",
"PadConvAct": "PadConv->ReLU",
# ConvDw
"QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
"ConvDwPad": "(QConvDw|ConvDepthWise2D)->Pad2D",
"ConvDwAct": "(QConvDw|ConvConvDepthWise2D2D)->ReLU",
"ConvDwActPad": "ConvDwAct->Pad2D",
# Max Pooling
"PadMaxPool": "MaxPooling2D<-Pad2D",
"MaxPoolAct": "MaxPooling2D->ReLU",
"PadMaxPoolAct": "PadMaxPool->ReLU",
# Average Pooling
"PadAvgPool": "AvgPooling2D<-Pad2D",
"AvgPoolAct": "AvgPooling2D->ReLU",
"PadAvgPoolAct": "PadAvgPool->ReLU",
# Global Average Pooling
"PadGlobalAvgPool": "GlobalAveragePooling2D<-Pad2D",
"GlobalAvgPoolAct": "GlobalAveragePooling2D->ReLU",
"PadGlobalAvgPoolAct": "PadGlobalAveragePool->ReLU",
# ElemWise
"QAdd": "Add->(Quantizer|QMul)",
"QSub": "Sub->(Quantizer|QMul)",
# "QMul": "Mul->Quantizer", # Already defined
"AddAct": "(QAdd|Add)->ReLU",
"SubAct": "(QSub|Sub)->ReLU",
"MulAct": "(QMul|Mul)->ReLU",
# Activation
"QReLU": "ReLU->(Quantizer|QMul)",
})
for node, recipe in cpp_recipes.items():
aidge_core.fuse_to_metaops(graph_view, recipe, node)
def set_nodes_names(scheduler):
"""
Set the CPP nodes names as well as their producers.
The producers naming is handled from their child node.
[TODO] Fc and Conv layers will always have weights as parent 1 and
possibly biases as parent 2. It may be better to previously label the
producers.
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
"""
node_ids = {} # Dict holding the node type along with a counter
node_it = 0 # Node Iterator
## MetaOps
for node in scheduler.get_sequential_static_scheduling():
node_type = node.type()
if node_type != "Producer":
if node.type() not in node_ids:
node_ids[node_type] = 0
# Set node name
node.set_name("_" + str(node_it) + "_" +
str(node_type) + "_" + str(node_ids[node_type]))
node_ids[node_type] += 1
node_it += 1
# Set producers names
## Weights & Biases producers
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_parent(1).set_name(node.name() + "_weights")
if node.get_parent(2) is not None:
node.get_parent(2).set_name(node.name() + "_biases")
## Scaling Producers
for node in scheduler.get_sequential_static_scheduling():
"""
TODO: If multiple quantizer nodes are found, the producers will
all have the same name and this will not work properly.
"""
if node.type() == "Producer":
child_node = node.output(0)[0][0]
if node.attributes().has_attr("shift_prod"):
node.set_name(child_node.name() + "_shift")
if node.attributes().has_attr("coef_prod"):
node.set_name(child_node.name() + "_coef")
def set_nodes_datatypes(graph_view: aidge_core.GraphView):
""" Set the nodes' datatypes
The set_datatype function can't be used on Conv2D and FC nodes directly
as the biases datatype is different from the other inputs.
TODO: Should be using forward_datatype()
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
"""
for node in graph_view.get_nodes():
if node.type() != "Producer":
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input
node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights
if node.get_parent(2) is not None:
node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases
node.get_operator().get_output(0).set_datatype(aidge_core.dtype.int8) # Output
else:
node.get_operator().set_datatype(aidge_core.dtype.int8)
# Set input node's datatype
for n in graph_view.get_input_nodes():
n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
def read_log_file(file_path: str):
""" Read log file
Used to read the aidge generated log files containing the intermediate
tensors of the exported model.
:param file_path: Path to the file to read.
:type file_path: str
"""
# Check if the file exists
if not os.path.isfile(file_path):
print(f"File not found: {file_path}")
return None
with open(file_path, 'r') as file:
content = file.read()
return content
def exclude_unwanted_producers(model):
""" Exclude some producers not needed for the export
Currently excludes the producers attached to the Mul, BitShift and Clip nodes, as they are
tensors holding a single value. This value is retrieved during the export
generation process and passed as argument directly within the Mul layer
configuration.
"""
nodes_to_ignore = ["Mul", "BitShift", "Clip"]
for node in model.get_nodes():
if node.type() == "Producer":
children_nodes = [n.type() for n in node.get_children()]
for node_type in nodes_to_ignore:
if node_type in children_nodes:
node.attributes().ignore = True
break
def set_scaling_attributes(export_node: aidge_core.export_utils.ExportNode, node: aidge_core.Node):
"""
Look recursively for a Quantizer node inside of the given node,
then set shift and coef attributes of the given export node.
[TODO] Should be moved into aidge_core.ExportNode
:param export_node: An instance of :py:class:`aidge_core.export_utils.ExportNode` to set the scaling
attributes needed for a quantized export.
:type export_node: aidge_core.export_utils.ExportNode
:param node: Node which may hold a Quantizer node.
:type node: aidge_core.Node
"""
QNode = get_node_from_metaop(node, "Quantizer")
if QNode:
BNode = get_node_from_metaop(QNode[0], "BitShift")
export_node.attributes["shift_value"] = BNode[0].get_operator().get_input(1)[0]
QMulNode = get_node_from_metaop(node, "QMul")
if QMulNode:
CNode = get_node_from_metaop(QMulNode[0], "Mul")
export_node.attributes["coef_value"] = CNode[0].get_operator().get_input(1)[0]
def normalize(array):
"""
Normalize an input image between -1 and 1
"""
if array.max() == array.min():
return array/array.max()
array = (array - array.min()) / (array.max() - array.min())
return 2 * array - 1
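Taken together, a sketch of how these helpers might be chained before calling `export()`; the quantized `model` object is assumed and the ordering simply follows the docstrings above:

```python
import aidge_core

# Fuse quantizer/activation patterns into export-friendly MetaOperators
cpp_fuse_to_metaops(model)

# Schedule, then name nodes and their producers from the scheduling order
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
set_nodes_names(scheduler)

# Force the int8/int32 datatypes expected by the quantized kernels
set_nodes_datatypes(model)

# Skip single-value producers whose data is inlined into the layer configs
exclude_unwanted_producers(model)
```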
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__ #ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__ #define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#include <type_traits> #include "network/activation_utils.hpp"
#include "network/typedefs.hpp" #include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/rescaling.hpp"
template<typename Output_T, typename T,
typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
return value;
}
template<typename Output_T, typename T,
typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
if (std::is_unsigned<Output_T>::value) {
return clamp(value, T(0), (T(1) << sat) - 1);
} else {
return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
}
}
template<typename Output_T,
typename Sum_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
int output,
ActivationFunction_T func,
const Rescaling_T& __restrict rescaling)
{
switch(func) {
case Linear:
case Saturation: {
break;
}
case Rectifier: {
if(weightedSum <= 0) weightedSum = 0;
break;
}
default:
// Unsupported activation function
break;
}
// Value fixed here for now but it should be generated by
// the export module or determined by the type of Output_T
// For now only works for int8_t and uint8_t
const uint32_t NB_BITS = 8;
return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
template<int NB_DATA,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void activation_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
...
...@@ -2,17 +2,19 @@ ...@@ -2,17 +2,19 @@
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__ #define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
#include "network/typedefs.hpp" #include "network/typedefs.hpp"
#include "network/rescaling.hpp" #include "network/activation_utils.hpp"
#include <math.h> #include <math.h>
// WARNING: this kernel only works for 32-bits floating point values // WARNING: this kernel only works for 32-bits floating point values
template<int NB_OUTPUTS, template<int NB_BATCHES, int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION, ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Input_T, typename Output_T,
typename Param_T> typename Param_T,
__attribute__((always_inline)) inline typename Rescaling_T>
__attribute__((always_inline)) inline
void batchnorm_forward ( void batchnorm_forward (
const Input_T* __restrict inputs, const Input_T* __restrict inputs,
Output_T* __restrict outputs, Output_T* __restrict outputs,
...@@ -20,18 +22,22 @@ void batchnorm_forward ( ...@@ -20,18 +22,22 @@ void batchnorm_forward (
const Param_T* __restrict variances, const Param_T* __restrict variances,
const Param_T* __restrict means, const Param_T* __restrict means,
const Param_T* __restrict scales, const Param_T* __restrict scales,
const double epsilon) const double epsilon,
const Rescaling_T& __restrict rescaling)
{ {
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) { for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
const Output_T var = sqrt(variances[output] + epsilon); for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
// If the variance is 0, we need to avoid division by 0
Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = OUTPUTS_HEIGHT * oy + ox; const int outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var; const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output]; const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling); outputs[outputOffset] = activation_forward_value<Output_T>(sAs, output, ACTIVATION, rescaling);
}
} }
} }
} }
......
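As a cross-check, a numpy sketch of what the updated kernel computes per channel, including the new zero-variance guard (NCHW layout as indexed above; the epsilon value is illustrative):

```python
import numpy as np

def batchnorm_ref(x, scales, biases, means, variances, eps=1e-5):
    # Guard mirrors the kernel: if the stored variance is 0, fall back to sqrt(eps) alone
    std = np.sqrt(np.where(variances > 0.0, variances + eps, eps))
    c = lambda v: v.reshape(1, -1, 1, 1)  # broadcast per-channel params over N, H, W
    return c(scales) * (x - c(means)) / c(std) + c(biases)
```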
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
template<int AXIS_SIZE_POST,
int AXIS_SIZE_PRE,
unsigned int NB_INPUTS,
typename T>
__attribute__((always_inline)) inline static
void concat_forward (
const T* const * __restrict inputs,
const unsigned int* __restrict sizes,
T* __restrict output)
{
unsigned int total_concat_axis_size = 0;
for (unsigned int n = 0; n < NB_INPUTS; ++n)
total_concat_axis_size += sizes[n];
for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
// Loop over post-axis (e.g., dims after axis 1)
for (int j = 0; j < AXIS_SIZE_POST; ++j) {
unsigned int axis_offset = 0;
// Loop over each input tensor
for (unsigned int n = 0; n < NB_INPUTS; ++n) {
for (unsigned int k = 0; k < sizes[n]; ++k) {
const int input_idx = i * sizes[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
output[i * total_concat_axis_size * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
inputs[n][input_idx];
}
axis_offset += sizes[n]; // move along axis in output
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
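To make the index arithmetic concrete, a small numpy sketch mirroring the loop structure above; AXIS_SIZE_PRE/AXIS_SIZE_POST are the products of the dimensions before/after the concat axis, and the shapes are illustrative:

```python
import numpy as np

a, b = np.random.rand(2, 3, 4), np.random.rand(2, 5, 4)  # concatenation along axis 1
ref = np.concatenate([a, b], axis=1)

AXIS_SIZE_PRE, AXIS_SIZE_POST, sizes = 2, 4, [3, 5]
flat_inputs = [a.ravel(), b.ravel()]
out = np.zeros(ref.size)
total = sum(sizes)
for i in range(AXIS_SIZE_PRE):
    axis_offset = 0
    for n, size in enumerate(sizes):
        for k in range(size):
            for j in range(AXIS_SIZE_POST):
                out[(i * total + axis_offset + k) * AXIS_SIZE_POST + j] = \
                    flat_inputs[n][(i * size + k) * AXIS_SIZE_POST + j]
        axis_offset += size
assert np.allclose(out.reshape(ref.shape), ref)
```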
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__ #define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp" #include "network/typedefs.hpp"
#include "network/rescaling.hpp" #include "network/rescaling_utils.hpp"
#include "network/utils.hpp" #include "network/utils.hpp"
#include "kernels/macs.hpp" #include "network/macs.hpp"
#include "kernels/activation.hpp" #include "network/activation_utils.hpp"
template<int NB_CHANNELS, template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS, int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
...@@ -17,10 +17,10 @@ template<int NB_CHANNELS, ...@@ -17,10 +17,10 @@ template<int NB_CHANNELS,
int DILATION_Y, int DILATION_X, int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH, int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION, ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T, typename Weight_T, typename Bias_T,
typename Rescaling_T> typename Rescaling_T>
__attribute__((always_inline)) inline __attribute__((always_inline)) inline
void convolution_forward( void convolution_forward(
const Input_T* __restrict inputs, const Input_T* __restrict inputs,
Output_T* __restrict outputs, Output_T* __restrict outputs,
...@@ -28,10 +28,10 @@ void convolution_forward( ...@@ -28,10 +28,10 @@ void convolution_forward(
const Bias_T* __restrict biases, const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling) const Rescaling_T& __restrict rescaling)
{ {
constexpr int DILATED_KERNEL_HEIGHT constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1); = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD constexpr int OUTPUTS_HEIGHT_NOPAD
...@@ -44,11 +44,13 @@ void convolution_forward( ...@@ -44,11 +44,13 @@ void convolution_forward(
: max(PADDING_Y - (oy * STRIDE_Y), 0); : max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0 const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT); 0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y; const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) { for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing --> // moved to inner loop for collapsing -->
...@@ -57,16 +59,16 @@ void convolution_forward( ...@@ -57,16 +59,16 @@ void convolution_forward(
const int sxMax = (PADDING_X == 0 const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH ? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH); 0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X; const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy); const int oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = NB_OUTPUTS * oPos; const int oOffset = NB_OUTPUTS * oPos;
// <-- // <--
// Check if the biases are defined
Bias_T weightedSum = biases[output]; Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0 if ((PADDING_Y != 0
...@@ -77,7 +79,7 @@ void convolution_forward( ...@@ -77,7 +79,7 @@ void convolution_forward(
} }
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y); const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
int iOffset = NB_CHANNELS * iPos; const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS; const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
...@@ -85,8 +87,8 @@ void convolution_forward( ...@@ -85,8 +87,8 @@ void convolution_forward(
|| sxMax - sxMin == KERNEL_WIDTH)) || sxMax - sxMin == KERNEL_WIDTH))
{ {
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>( macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
inputs + iOffset, inputs + iOffset,
weights + wOffset, weights + wOffset,
weightedSum); weightedSum);
} }
else { else {
...@@ -98,13 +100,13 @@ void convolution_forward( ...@@ -98,13 +100,13 @@ void convolution_forward(
continue; continue;
} }
int iOffsetInRange = iOffset const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS; + sx * DILATION_X * NB_CHANNELS;
macsOnRange<NB_CHANNELS>( macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur // same input line so no wrapping can occur
inputs + iOffsetInRange, inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS, weights + wOffset + sx * NB_CHANNELS,
weightedSum); weightedSum);
} }
} }
...@@ -116,4 +118,45 @@ void convolution_forward( ...@@ -116,4 +118,45 @@ void convolution_forward(
} }
} }
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__ #endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#include "network/typedefs.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
"NB_OUTPUTS should be a multiple of NB_CHANNELS.");
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
const int oOffset = NB_OUTPUTS * oPos;
// <--
const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
{
continue;
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy)
* KERNEL_WIDTH;
if (DILATION_X == 1 && ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
inputs + iOffset + channel,
weights + wOffset,
weightedSum);
}
else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
{
continue;
}
const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
weightedSum += inputs[iOffsetInRange + channel]
* weights[wOffset + sx];
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
}
}
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_depthwise_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__ #define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
#include "network/typedefs.hpp" #include "network/typedefs.hpp"
#include "kernels/activation.hpp" #include "network/activation_utils.hpp"
// Generic function for two inputs // Generic function for two inputs
......
...@@ -2,20 +2,20 @@ ...@@ -2,20 +2,20 @@
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__ #define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp" #include "network/typedefs.hpp"
#include "network/rescaling.hpp" #include "network/rescaling_utils.hpp"
#include "network/utils.hpp" #include "network/utils.hpp"
#include "kernels/macs.hpp" #include "network/macs.hpp"
#include "kernels/activation.hpp" #include "network/activation_utils.hpp"
template<int NB_CHANNELS, template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS, int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION, ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T, typename Weight_T, typename Bias_T,
typename Rescaling_T> typename Rescaling_T>
__attribute__((always_inline)) inline __attribute__((always_inline)) inline
void fullyconnected_forward ( void fullyconnected_forward (
const Input_T* __restrict inputs, const Input_T* __restrict inputs,
Output_T* __restrict outputs, Output_T* __restrict outputs,
...@@ -28,14 +28,17 @@ void fullyconnected_forward ( ...@@ -28,14 +28,17 @@ void fullyconnected_forward (
// It is only an issue if the FC was after a flatten layer. // It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1 // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat // Solution: Add a system to check dataformat
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) { for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och]; Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) { for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) { for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) { for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch] weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix]; * weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
} }
} }
...@@ -45,10 +48,12 @@ void fullyconnected_forward ( ...@@ -45,10 +48,12 @@ void fullyconnected_forward (
} }
/* /*
Here the kernel to use with inputs in NHWC and weights in NHWC Here the kernel to use with inputs in NHWC and weights in NHWC
#ifdef _OPENMP
#pragma omp parallel for #pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) { for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och]; Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) { for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy); const int iPos = (CHANNELS_WIDTH * iy);
...@@ -58,8 +63,8 @@ Here the kernel to use with inputs in NHWC and weights in NHWC ...@@ -58,8 +63,8 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
* (iy + CHANNELS_HEIGHT * och); * (iy + CHANNELS_HEIGHT * och);
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>( macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset, inputs + iOffset,
weights + wOffset, weights + wOffset,
weightedSum); weightedSum);
} }
...@@ -69,4 +74,4 @@ Here the kernel to use with inputs in NHWC and weights in NHWC ...@@ -69,4 +74,4 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
} }
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__ #endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
\ No newline at end of file
#ifndef __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#define __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
// Todo add border value and border type (Reflect, Constant, Wrap...) and add
// the two missing pad value (bottom and right)
template <unsigned int AXIS_DIM_SIZE,
unsigned int PREAXIS_STRIDE,
unsigned int AXIS_STRIDE,
unsigned int POSTAXIS_STRIDE,
unsigned int NB_ELTS,
typename Input_T,
typename Output_T>
// void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
// std::vector<DimSize_t>& dims, const void* input_, void* output_)
__attribute__((always_inline)) inline void
hardmax2d_forward(const Input_T *__restrict input,
Output_T *__restrict output) {
// fill output with 0
for (Output_T *i = output; i != output + NB_ELTS; ++i) {
*i = 0;
}
// For each index on all the axes before and after 'axis', we have a
// different max element to find
for (unsigned int i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
for (unsigned int j = 0; j < POSTAXIS_STRIDE; ++j) {
// Init the max with first element
unsigned int maxIdx = 0;
Input_T maxVal = input[preAxisOffset + j];
// Loop over the elements on 'axis'
// Since we start at 0th idx, we already initialize the values like
// the 1st iteration has been done
for (unsigned int k = 1,
postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
k < AXIS_DIM_SIZE;
++k, postAxisOffset += POSTAXIS_STRIDE) {
Input_T currVal = input[postAxisOffset + j];
// Update max elements
if (currVal > maxVal) {
maxIdx = k;
maxVal = currVal;
}
}
output[preAxisOffset + maxIdx * POSTAXIS_STRIDE + j] = 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
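For reference, the same one-hot argmax behaviour expressed with numpy (axis and shape are illustrative; like the kernel, ties resolve to the first maximum):

```python
import numpy as np

x = np.random.rand(2, 5, 3)
out = np.zeros_like(x)
# Place a 1 at the position of the maximum along the chosen axis, 0 elsewhere
np.put_along_axis(out, np.argmax(x, axis=1, keepdims=True), 1, axis=1)
```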
...@@ -11,7 +11,9 @@ void leakyrelu_forward ( ...@@ -11,7 +11,9 @@ void leakyrelu_forward (
Output_T* __restrict outputs, Output_T* __restrict outputs,
const float negative_slope) const float negative_slope)
{ {
#ifdef _OPENMP
#pragma omp parallel for #pragma omp parallel for
#endif
for (int i = 0; i < NB_DATA; ++i) { for (int i = 0; i < NB_DATA; ++i) {
if (inputs[i] >= 0) { if (inputs[i] >= 0) {
outputs[i] = inputs[i]; outputs[i] = inputs[i];
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define __AIDGE_EXPORT_CPP_KERNELS_MATMUL__ #define __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#include "network/typedefs.hpp" #include "network/typedefs.hpp"
#include "kernels/activation.hpp" #include "network/activation_utils.hpp"
// Generic function for matmul and activation // Generic function for matmul and activation
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_PAD2D__
#define __AIDGE_EXPORT_CPP_KERNELS_PAD2D__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
// Todo add border value and border type (Reflect, Constant, Wrap...) and add the two missing pad value (bottom and right)
template<int NB_BATCHES, int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_TOP,
int PADDING_LEFT,
int PADDING_BOTTOM,
int PADDING_RIGHT,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void pad_forward(
double borderValue,
const Input_T* __restrict inputs,
Output_T* __restrict outputs
)
{
const unsigned int oySize = CHANNELS_HEIGHT + PADDING_TOP + PADDING_BOTTOM;
const unsigned int oxSize = CHANNELS_WIDTH + PADDING_LEFT + PADDING_RIGHT;
for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
for (unsigned int ch = 0; ch < NB_CHANNELS; ++ch) {
const unsigned int preIndex = batch * NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH + ch * CHANNELS_HEIGHT * CHANNELS_WIDTH;
for (unsigned int oy = 0; oy < oySize; ++oy) {
for (unsigned int ox = 0; ox < oxSize; ++ox) {
const unsigned int outIndex = batch * NB_CHANNELS * oySize * oxSize + ch * oySize * oxSize + oy * oxSize + ox;
outputs[outIndex] = borderValue;
const unsigned int inputX = ox - PADDING_LEFT;
const unsigned int inputY = oy - PADDING_TOP;
if (inputY >= 0 and inputY < CHANNELS_HEIGHT and inputX >= 0 and inputX < CHANNELS_WIDTH)
{
outputs[outIndex] = inputs[preIndex + inputY * CHANNELS_WIDTH + inputX];
}
}
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_PAD2D__
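Equivalently in numpy, for a constant border value on an NCHW tensor (all values illustrative):

```python
import numpy as np

x = np.random.rand(1, 3, 8, 8)  # NB_BATCHES, NB_CHANNELS, H, W
top, bottom, left, right, border_value = 1, 1, 2, 2, 0.0
padded = np.pad(x, ((0, 0), (0, 0), (top, bottom), (left, right)),
                mode="constant", constant_values=border_value)
```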