
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (239)
Showing with 942 additions and 222 deletions
......@@ -11,6 +11,13 @@ __pycache__
*.pyc
*.egg-info
dist*/
aidge_export_cpp/_version.py
wheelhouse/*
env_aidge/
# Temp test folders
aidge_export_cpp/unit_tests/*_temp_test
*_test/
# Mermaid
*.mmd
......@@ -19,4 +26,7 @@ dist*/
xml*/
# ONNX
*.onnx
\ No newline at end of file
*.onnx
# GDB
.gdb_history
\ No newline at end of file
###############################################################################
# Aidge Continuous Integration and Deployment #
# #
###############################################################################
stages:
- static_analysis
- build
- test
- coverage
- release
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/pip.gitlab-ci.yml'
# Since aidge_export_cpp is a pure Python package, building on Windows and on Ubuntu doesn't differ
# - '.gitlab/ci/windows_python.gitlab-ci.yml'
test:ubuntu_python:
before_script:
- !reference [.setup:test:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
# Need to install extra dependencies for tests:
- python -m pip install torch torchvision
coverage:ubuntu_python:
before_script:
- !reference [.setup:coverage:ubuntu_python, before_script]
- DEPS_NAMES=("aidge_onnx" "aidge_quantization")
- DEPENDENCY_JOB="build:ubuntu_python"
- !reference [.ubuntu:download:artifacts, script]
\ No newline at end of file
#!/bin/bash
set -e
if [[ "$1" == "" ]]; then
echo "build aidge deps in cibuildwheel container before building wheel."
echo "search path defines where the dependencies will be searched."
echo "Hint : In wheel containers, files are mounted on /host by default."
echo "\nusage : ./cibuildwheel_build_deps_before_build_wheel.sh $search_path"
fi
set -x
if [[ $AIDGE_DEPENDENCIES == "" ]]; then # case for aidge_core
mkdir -p build # create the build dir if it's not already there to hold the build of cpp files
rm -rf build/* # build from scratch
else
for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
search_path=$1
REPO_PATH=$(find $search_path ! -writable -prune -o -type d \
-name "$repo" \
-not -path "*/install/*" \
-not -path "*/.git/*" \
-not -path "*/miniconda/*" \
-not -path "*/conda/*" \
-not -path "*/.local/*" \
-not -path "*/lib/*" \
-not -path "*/$repo/$repo/*" \
-not -path "*/proc/*" \
-print -quit)
if [[ -z "$REPO_PATH" ]]; then
echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
exit 1
fi
cd $REPO_PATH
mkdir -p build # create the build dir if it's not already there to hold the build of cpp files
rm -rf build/* # build from scratch
pip install . -v
cd -
done
fi
set +x
set +e
# Version 0.2.1 (January 31, 2025)
# Version 0.2.0 (December 6, 2024)
# Version 0.0.1 (January 23, 2024)
Initial release
include MANIFEST.in
include LICENSE
include README.md
recursive-include aidge_export_cpp *
include setup.py
include version.txt
# Aidge CPP Export
Use this module to export your Aidge model to a generic CPP export
\ No newline at end of file
Use this module to export your Aidge model to a generic CPP export
## Install
Install with:
pip install -v .
## Development mode install
For editable/development mode, install with:
pip install -v --no-build-isolation -e .
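A minimal end-to-end usage sketch (assuming `aidge_onnx` and `aidge_backend_cpu` are installed; the model file name and input shape are placeholders):

import aidge_core
import aidge_onnx
import aidge_export_cpp

# Load a model (placeholder file name) and schedule it on the CPU backend
model = aidge_onnx.load_onnx("model.onnx")
model.set_backend("cpu")
model.forward_dims([[1, 3, 224, 224]])  # illustrative input dimensions

scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()

# Generate the standalone C++ project in ./export_cpp/
aidge_export_cpp.export("export_cpp", model, scheduler)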
......@@ -2,14 +2,15 @@ r"""
Aidge Export for CPP standalone projects
"""
from pathlib import Path
from .operators import *
from collections import defaultdict
import aidge_core
from aidge_export_cpp.utils import ROOT
__version__ = open(ROOT / "version.txt", "r").read().strip()
# Constants
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]
from .export_registry import ExportLibCpp
from .export_utils import *
from .operators import *
from .export import *
from . import benchmark
import contextlib
import os
from shutil import rmtree
from subprocess import run
import numpy as np
import aidge_core
import aidge_backend_cpu
import aidge_export_cpp
def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
# load and set up the model
# model.set_datatype(ai.dtype.float32)
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
# for ordered_input in ordered_inputs:
# ordered_input.set_backend("cpu")
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" ├─Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_inference_time_cpp(folder_name, model, nb_iterations, nb_warmup)
print(" ok")
print(" ├─Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
timings_str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
timings = [float(t) for t in timings_str.stdout.split(' ') if t.strip()]
return timings
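A usage sketch for the helper above, assuming this file is the `benchmark` module imported in `__init__.py`; the model, input name and input shape are illustrative:

import numpy as np
import aidge_export_cpp

# inputs are given as (name, array) pairs; `model` is an aidge_core.GraphView
data = [("input_0", np.random.rand(1, 3, 32, 32).astype(np.float32))]
timings = aidge_export_cpp.benchmark.measure_inference_time(model, data, nb_warmup=5, nb_iterations=20)
print("mean timing:", np.mean(timings))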
def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
# load and set up the model
model.set_backend("cpu")
# create input Tensor list for the GraphView
ordered_inputs: list[aidge_core.Tensor] = []
# [tmp fix] manual transposition of data for input of export BEFORE converting to Tensor
for i in input_data:
nb_dims = len(i[1].shape)
if nb_dims == 3:
ordered_inputs.append(aidge_core.Tensor(i[1].transpose(0,2,1).reshape(i[1].shape).copy()))
elif nb_dims == 4:
ordered_inputs.append(aidge_core.Tensor(np.transpose(i[1], axes=(0,2,3,1)).reshape(i[1].shape).copy()))
else:
ordered_inputs.append(aidge_core.Tensor(i[1]))
# set inputs for the export
for i, inp in enumerate(model.get_ordered_inputs()):
op = inp[0].get_operator()
op.set_input(i, ordered_inputs[i])
model.forward_dims([t.dims() for t in ordered_inputs])
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
operator_type: str = model.get_ordered_outputs()[0][0].get_operator().type()
print(" │ Generating export...", end="", flush=True)
folder_name: str = f"{operator_type.lower()}_test_export_cpp"
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
aidge_core.export_utils.scheduler_export(
scheduler,
folder_name,
aidge_export_cpp.ExportLibCpp,
memory_manager=aidge_core.mem_info.generate_optimized_memory_info,
memory_manager_args={"wrapping": False }
)
aidge_core.export_utils.generate_main_display_output_cpp(folder_name, model)
print(" ok")
print(" │ Compiling...", end="", flush=True)
with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f):
run(['make'], cwd=folder_name, stdout=f)
print(" ok")
output_str: str = run(f'./{folder_name}/bin/run_export', capture_output=True, text=True)
folder_path = os.path.abspath(folder_name)
if os.path.exists(folder_path):
rmtree(folder_path, ignore_errors=True)
outputs_str: list[str] = output_str.stdout.strip().split('\n')
outputs = [np.array([float(val) for val in single_output_str.split(' ') if val.strip()]) for i, single_output_str in enumerate(outputs_str)]
for i, pair in enumerate(model.get_ordered_outputs()):
dims = pair[0].get_operator().get_output(pair[1]).dims()
nb_dims = len(dims)
dims_permutted = dims
if nb_dims == 3:
dims_permutted = [dims[0], dims[2], dims[1]]
if nb_dims == 4:
dims_permutted = [dims[0], dims[2], dims[3], dims[1]]
if np.prod(dims) != outputs[i].size:
aidge_core.Log.fatal("Incompatible export output size ({}) with required shape {}", outputs[i].size, dims)
outputs[i] = outputs[i].reshape(dims_permutted)
if nb_dims == 3:
outputs[i] = outputs[i].transpose(0,2,1)
if nb_dims == 4:
outputs[i] = outputs[i].transpose(0,3,1,2)
return outputs
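A matching sketch for `compute_output` (same assumptions as above); note that the outputs are transposed back to the original NCHW layout before being returned:

outputs = aidge_export_cpp.benchmark.compute_output(model, data)
# The export outputs can therefore be compared directly with a
# backend_cpu reference array, e.g.:
# np.testing.assert_allclose(outputs[0], reference, rtol=1e-4)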
import re
import os
from pathlib import Path
import shutil
import numpy as np
from pathlib import Path
from typing import List, Union
from jinja2 import Environment, FileSystemLoader
import aidge_core
from aidge_core.export.code_generation import *
from aidge_export_cpp.utils import (ROOT, OPERATORS_REGISTRY, supported_operators)
from aidge_export_cpp.utils.converter import aidge_datatype2ctype, numpy_dtype2ctype
import aidge_export_cpp.operators
from aidge_export_cpp.utils.generation import *
from aidge_export_cpp.memory import *
def generate_input_file(export_folder:str,
array_name:str,
array: np.ndarray):
# If directory doesn't exist, create it
if not os.path.exists(export_folder):
os.makedirs(export_folder)
generate_file(
file_path=f"{export_folder}/{array_name}.h",
template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
dims = array.shape,
data_t = numpy_dtype2ctype(array.dtype),
name = array_name,
values = array.tolist()
)
def export(export_folder_name, graphview, scheduler):
export_folder = Path().absolute() / export_folder_name
os.makedirs(str(export_folder), exist_ok=True)
dnn_folder = export_folder / "dnn"
os.makedirs(str(dnn_folder), exist_ok=True)
list_actions = []
list_configs = []
from aidge_core.mem_info import generate_optimized_memory_info
from aidge_core.export_utils import scheduler_export, generate_main_cpp, aidge2c, generate_file
list_forward_nodes = scheduler.get_static_scheduling()
from aidge_export_cpp import ExportLibCpp, ROOT
from aidge_export_cpp.export_utils import read_log_file
for node in list_forward_nodes:
if node.type() in supported_operators():
op = OPERATORS_REGISTRY[node.type()](node)
# For configuration files
list_configs = op.export(dnn_folder, list_configs)
# For forward file
list_actions = op.forward(list_actions)
# Memory management
mem_size, mem_info = compute_default_mem_info(scheduler)
# Generate the memory file
generate_file(
str(dnn_folder / "memory" / "mem_info.h"),
str(ROOT / "templates" / "memory" / "mem_info.jinja"),
mem_size = mem_size,
mem_info_legends = MEMORY_INFO_TEMPLATE,
mem_info = mem_info
)
list_configs.append("memory/mem_info.h")
# Get entry nodes
# It assumes the entry nodes are Producers with constant=false
# Store the datatype & name
list_inputs_name = []
for node in graphview.get_nodes():
if node.type() == "Producer":
if not node.get_operator().get_attr("Constant"):
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype())
list_inputs_name.append((export_type, node.name()))
# Get output nodes
# Store the datatype & name, like entry nodes
list_outputs_name = []
for node in graphview.get_nodes():
if len(node.get_children()) == 0:
export_type = aidge_datatype2ctype(node.get_operator().get_output(0).dtype())
list_outputs_name.append((export_type, node.name()))
# Generate forward file
generate_file(
str(dnn_folder / "src" / "forward.cpp"),
str(ROOT / "templates" / "network" / "network_forward.jinja"),
headers=list_configs,
actions=list_actions,
inputs= list_inputs_name,
outputs=list_outputs_name
)
# Generate dnn API
generate_file(
str(dnn_folder / "include" / "dnn.hpp"),
str(ROOT / "templates" / "network" / "dnn_header.jinja"),
libraries=[],
functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
)
# Copy all static files in the export
shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
\ No newline at end of file
def export(export_folder_name: str,
graphview: aidge_core.GraphView,
scheduler: Union[List[aidge_core.Node],
aidge_core.Scheduler],
inputs_tensor: aidge_core.Tensor = None,
labels: aidge_core.Tensor = None,
dev_mode: bool = False,
aidge_cmp: bool = False):
""" Export an aidge_core.Scheduler to C++ code
:param export_folder_name: Export folder name
:type export_folder_name: str
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
:type graph_view: aidge_core.GraphView
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
:param inputs_tensor: **For future** argument to provide tensor to use in the main function, not implemented yet!
By default, the input of the given graph will be exported.
:type input_tensor: aidge_core.Tensor
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
:param labels: Argument to provide labels tensor to generate and use in the main function.
:type labels: aidge_core.Tensor
:param dev_mode: Wether or not the developer mode is enabled. If enabled, the export files
will be symlinks from the aidge_export_cpp module. Therefore, modifying
a file within the export will change the module as well.
:type dev_mode: boolean
"""
export_folder_name = Path(export_folder_name)
# Remove existing export
if os.path.isdir(export_folder_name):
print("Removing existing export directory...")
shutil.rmtree(export_folder_name)
# Generate Model Files
"""
Perform the following tasks :
- Generate the parameters and layers config files
- Generate the forward.cpp file
- Copy all needed kernels
"""
scheduler_export(scheduler,
export_folder_name,
ExportLibCpp,
memory_manager=generate_optimized_memory_info,
memory_manager_args={
"stats_folder": f"{export_folder_name}/stats"},
dev_mode=dev_mode)
# Generate main file
generate_main_cpp(export_folder_name, graphview, labels=labels, inputs_tensor=inputs_tensor)
# Generate log files (aidge_cmp option)
"""
If the aidge_cmp option has been enabled, the generated log_outputs will
be copied into the generated export in order to be used as reference.
"""
if aidge_cmp:
ranked_nodes = graphview.get_ranked_nodes_name("{0}[{1}#{3}]")
os.makedirs(export_folder_name / "data" / "aidge_outputs")
os.makedirs(export_folder_name / "data" / "export_outputs")
for node in graphview.get_nodes():
if node.type() != "Producer":
file_path = 'log_outputs/' + ranked_nodes[node] + '/output_0.log'
data_t = aidge2c(node.get_operator().get_output(0).dtype())
name = node.name() + '_output_0_aidge'
dims = node.get_operator().get_output(0).dims()
values = read_log_file(file_path)
generate_file(export_folder_name / "data" / "aidge_outputs" / (node.name() + ".hpp"),
ROOT / "templates" / "data" / "aidge_tensor.jinja",
data_t=data_t,
name=name,
dims=dims,
values=values)
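A hedged example of calling this function with its optional flags (the folder name is illustrative; aidge_cmp=True assumes the Aidge log_outputs/ reference files were generated beforehand):

export("resnet_export",      # export folder, removed and recreated if it already exists
       graphview=model,
       scheduler=scheduler,
       dev_mode=True,        # symlink export files back to the module sources
       aidge_cmp=False)      # set True only if log_outputs/ is available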
from aidge_core.export_utils import ExportLib
from aidge_export_cpp import ROOT
class ExportLibCpp(ExportLib):
_name="export_cpp"
static_files={
str(ROOT / "static" / "Makefile"): "",
str(ROOT / "static" / "typedefs.hpp"): "dnn/include/network",
str(ROOT / "static" / "utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "rescaling_utils.hpp"): "dnn/include/network",
str(ROOT / "static" / "activation_utils.hpp"): "dnn/include/network",
}
import os
from collections import OrderedDict
import aidge_core
from aidge_core.export_utils import get_node_from_metaop
def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
"""
Fuse nodes into metaops adapted for the CPP Export
TODO: These recipes should be moved into aidge_core
:param graph_view: An instance of :py:class:`aidge_core.GraphView`, providing access to nodes and
ordered input/output data within the computational graph.
"""
cpp_recipes = OrderedDict({
# Quantization
"QMul": "Mul->Quantizer", # Fixed Point Scaling
# FC
"QFC": "FC->(Quantizer|QMul)",
"FCAct": "(FC|QFC)->ReLU",
# Conv
"QConv": "Conv2D->(Quantizer|QMul)",
"PadConv": "(QConv|Conv2D)<-Pad2D",
"ConvAct": "(QConv|Conv2D)->ReLU",
"PadConvAct": "PadConv->ReLU",
# ConvDw
"QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
"ConvDwPad": "(QConvDw|ConvDepthWise2D)->Pad2D",
"ConvDwAct": "(QConvDw|ConvConvDepthWise2D2D)->ReLU",
"ConvDwActPad": "ConvDwAct->Pad2D",
# Max Pooling
"PadMaxPool": "MaxPooling2D<-Pad2D",
"MaxPoolAct": "MaxPooling2D->ReLU",
"PadMaxPoolAct": "PadMaxPool->ReLU",
# Average Pooling
"PadAvgPool": "AvgPooling2D<-Pad2D",
"AvgPoolAct": "AvgPooling2D->ReLU",
"PadAvgPoolAct": "PadAvgPool->ReLU",
# Global Average Pooling
"PadGlobalAvgPool": "GlobalAveragePooling2D<-Pad2D",
"GlobalAvgPoolAct": "GlobalAveragePooling2D->ReLU",
"PadGlobalAvgPoolAct": "PadGlobalAveragePool->ReLU",
# ElemWise
"QAdd": "Add->(Quantizer|QMul)",
"QSub": "Sub->(Quantizer|QMul)",
# "QMul": "Mul->Quantizer", # Already defined
"AddAct": "(QAdd|Add)->ReLU",
"SubAct": "(QSub|Sub)->ReLU",
"MulAct": "(QMul|Mul)->ReLU",
# Activation
"QReLU": "ReLU->(Quantizer|QMul)",
})
for node, recipe in cpp_recipes.items():
aidge_core.fuse_to_metaops(graph_view, recipe, node)
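For illustration, a short sketch of the effect of these recipes (`model` stands for a quantized aidge_core.GraphView):

cpp_fuse_to_metaops(model)

# After fusion, e.g. Conv2D->Quantizer becomes a "QConv" metaop and
# (QConv|Conv2D)->ReLU becomes "ConvAct"; the remaining node types should
# mostly be the metaop names listed above.
for node in model.get_nodes():
    print(node.type())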
def set_nodes_names(scheduler):
"""
Set the CPP nodes' names as well as their producers'.
The producers' names are derived from their child node.
[TODO] FC and Conv layers will always have weights as parent 1 and
possibly biases as parent 2. It may be better to label the producers
beforehand.
:param scheduler: Scheduler instance managing the computation graph.
Uses `graph_view` and `get_sequential_static_scheduling` methods
to retrieve the computation graph layout and ordered nodes.
:type scheduler: aidge_core.Scheduler
"""
node_ids = {} # Dict holding the node type along with a counter
node_it = 0 # Node Iterator
## MetaOps
for node in scheduler.get_sequential_static_scheduling():
node_type = node.type()
if node_type != "Producer":
if node.type() not in node_ids:
node_ids[node_type] = 0
# Set node name
node.set_name("_" + str(node_it) + "_" +
str(node_type) + "_" + str(node_ids[node_type]))
node_ids[node_type] += 1
node_it += 1
# Set producers names
## Weights & Biases producers
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_parent(1).set_name(node.name() + "_weights")
if node.get_parent(2) is not None:
node.get_parent(2).set_name(node.name() + "_biases")
## Scaling Producers
for node in scheduler.get_sequential_static_scheduling():
"""
TODO: If multiple quantizer nodes are found, the producers will
all have the same name and this will not work properly.
"""
if node.type() == "Producer":
child_node = node.output(0)[0][0]
if node.attributes().has_attr("shift_prod"):
node.set_name(child_node.name() + "_shift")
if node.attributes().has_attr("coef_prod"):
node.set_name(child_node.name() + "_coef")
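A short illustration of the resulting naming scheme (indices depend on the actual schedule, so the names below are only indicative):

set_nodes_names(scheduler)

# The third non-Producer node of the schedule, if it is the first "PadConvAct"
# encountered, is named "_2_PadConvAct_0"; its parameter producers become
# "_2_PadConvAct_0_weights" and "_2_PadConvAct_0_biases", and a quantizer
# shift producer attached to it becomes "_2_PadConvAct_0_shift".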
def set_nodes_datatypes(graph_view: aidge_core.GraphView):
""" Set the nodes' datatypes
The set_datatype function can't be used on Conv2D and FC nodes directly
as the biases datatype is different from the other inputs.
TODO: Should be using forward_datatype()
:param graph_view: An instance of :py:class:`aidge_core.graph_view`, providing access to nodes and
ordered input/output data within the computational graph.
"""
for node in graph_view.get_nodes():
if node.type() != "Producer":
if get_node_from_metaop(node, "FC") or \
get_node_from_metaop(node, "Conv2D") or \
get_node_from_metaop(node, "ConvDepthWise2D"):
node.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8) # Input
node.get_operator().get_input(1).set_datatype(aidge_core.dtype.int8) # Weights
if node.get_parent(2) is not None:
node.get_operator().get_input(2).set_datatype(aidge_core.dtype.int32) # Biases
node.get_operator().get_output(0).set_datatype(aidge_core.dtype.int8) # Output
else:
node.get_operator().set_datatype(aidge_core.dtype.int8)
# Set input node's datatype
for n in graph_view.get_input_nodes():
n.get_operator().get_input(0).set_datatype(aidge_core.dtype.int8)
def read_log_file(file_path: str):
""" Read log file
Used to read the aidge generated log files containing the intermediate
tensors of the exported model.
:param file_path: Path to the file to read.
:type file_path: str
"""
# Check if the file exists
if not os.path.isfile(file_path):
print(f"File not found: {file_path}")
return None
with open(file_path, 'r') as file:
content = file.read()
return content
def exclude_unwanted_producers(model):
""" Exclude some producers not needed for the export
Currently excludes the producers attached to Mul, BitShift and Clip nodes, as they are
tensors holding a single value. This value is retrieved during the export
generation process and passed directly as an argument within the layer
configuration.
"""
nodes_to_ignore = ["Mul", "BitShift", "Clip"]
for node in model.get_nodes():
if node.type() == "Producer":
children_nodes = [n.type() for n in node.get_children()]
for node_type in nodes_to_ignore:
if node_type in children_nodes:
node.attributes().ignore = True
break
def set_scaling_attributes(export_node: aidge_core.export_utils.ExportNode, node: aidge_core.Node):
"""
Look recursively for a Quantizer node inside of the given node,
then set shift and coef attributes of the given export node.
[TODO] Should be moved into aidge_core.ExportNode
:param export_node: An instance of :py:class:`aidge_core.export_utils.ExportNode` to set the scaling
attributes needed for a quantized export.
:type export_node: aidge_core.export_utils.ExportNode
:param node: Node which may hold a Quantizer node.
:type node: aidge_core.Node
"""
QNode = get_node_from_metaop(node, "Quantizer")
if QNode:
BNode = get_node_from_metaop(QNode[0], "BitShift")
export_node.attributes["shift_value"] = BNode[0].get_operator().get_input(1)[0]
QMulNode = get_node_from_metaop(node, "QMul")
if QMulNode:
CNode = get_node_from_metaop(QMulNode[0], "Mul")
export_node.attributes["coef_value"] = CNode[0].get_operator().get_input(1)[0]
def normalize(array):
"""
Normalize an input image between -1 and 1
"""
if array.max() == array.min():
return array/array.max()
array = (array - array.min()) / (array.max() - array.min())
return 2 * array - 1
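A quick numeric example of this helper on an 8-bit image range:

import numpy as np

img = np.array([0.0, 127.0, 255.0])
print(normalize(img))  # approximately [-1.0, -0.004, 1.0]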
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
#include <type_traits>
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "network/rescaling.hpp"
template<typename Output_T, typename T,
typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
return value;
}
template<typename Output_T, typename T,
typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
if (std::is_unsigned<Output_T>::value) {
return clamp(value, T(0), (T(1) << sat) - 1);
} else {
return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
}
}
template<typename Output_T,
typename Sum_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
int output,
ActivationFunction_T func,
const Rescaling_T& __restrict rescaling)
{
switch(func) {
case Linear:
case Saturation: {
break;
}
case Rectifier: {
if(weightedSum <= 0) weightedSum = 0;
break;
}
default:
// Unsupported activation function
break;
}
// Value fixed here for now but it should be generated by
// the export module or determined by the type of Output_T
// For now only works for int8_t and uint8_t
const uint32_t NB_BITS = 8;
return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
#include "network/activation_utils.hpp"
#include "network/rescaling_utils.hpp"
template<int NB_DATA,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void activation_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......
......@@ -2,17 +2,19 @@
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/activation_utils.hpp"
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
template<int NB_OUTPUTS,
template<int NB_BATCHES, int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
__attribute__((always_inline)) inline
typename Input_T, typename Output_T,
typename Param_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -20,18 +22,22 @@ void batchnorm_forward (
const Param_T* __restrict variances,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const double epsilon)
const double epsilon,
const Rescaling_T& __restrict rescaling)
{
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
const Output_T var = sqrt(variances[output] + epsilon);
for (unsigned int batch = 0; batch < NB_BATCHES; ++batch) {
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
// If the variance is 0, we need to avoid division by 0
Output_T var = sqrt(variances[output] > 0.0 ? variances[output] + epsilon : epsilon);
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
const int outputOffset = batch * OUTPUTS_WIDTH * OUTPUTS_HEIGHT * NB_OUTPUTS + output * OUTPUTS_WIDTH * OUTPUTS_HEIGHT + OUTPUTS_WIDTH * oy + ox;
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
const Output_T normalized = (inputs[outputOffset] - means[output]) / var;
const Output_T sAs = scales[output] * normalized + biases[output];
outputs[outputOffset] = activation_forward_value<Output_T>(sAs, output, ACTIVATION, rescaling);
}
}
}
}
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
#define __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
template<int AXIS_SIZE_POST,
int AXIS_SIZE_PRE,
unsigned int NB_INPUTS,
typename T>
__attribute__((always_inline)) inline static
void concat_forward (
const T* const * __restrict inputs,
const unsigned int* __restrict sizes,
T* __restrict output)
{
unsigned int total_concat_axis_size = 0;
for (unsigned int n = 0; n < NB_INPUTS; ++n)
total_concat_axis_size += sizes[n];
for (int i = 0; i < AXIS_SIZE_PRE; ++i) {
// Loop over post-axis (e.g., dims after axis 1)
for (int j = 0; j < AXIS_SIZE_POST; ++j) {
unsigned int axis_offset = 0;
// Loop over each input tensor
for (unsigned int n = 0; n < NB_INPUTS; ++n) {
for (unsigned int k = 0; k < sizes[n]; ++k) {
const int input_idx = i * sizes[n] * AXIS_SIZE_POST + k * AXIS_SIZE_POST + j;
output[i * total_concat_axis_size * AXIS_SIZE_POST + (axis_offset + k) * AXIS_SIZE_POST + j] =
inputs[n][input_idx];
}
axis_offset += sizes[n]; // move along axis in output
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONCAT__
\ No newline at end of file
......@@ -2,13 +2,13 @@
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
......@@ -17,10 +17,10 @@ template<int NB_CHANNELS,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -28,10 +28,10 @@ void convolution_forward(
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
......@@ -44,11 +44,13 @@ void convolution_forward(
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
......@@ -57,16 +59,16 @@ void convolution_forward(
const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
int oOffset = NB_OUTPUTS * oPos;
const int oOffset = NB_OUTPUTS * oPos;
// <--
Bias_T weightedSum = biases[output];
// Check if the biases are defined
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
......@@ -77,7 +79,7 @@ void convolution_forward(
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
int iOffset = NB_CHANNELS * iPos;
const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
......@@ -85,8 +87,8 @@ void convolution_forward(
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
inputs + iOffset,
weights + wOffset,
inputs + iOffset,
weights + wOffset,
weightedSum);
}
else {
......@@ -98,13 +100,13 @@ void convolution_forward(
continue;
}
int iOffsetInRange = iOffset
const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur
inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS,
inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS,
weightedSum);
}
}
......@@ -116,4 +118,45 @@ void convolution_forward(
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
#include "network/typedefs.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
"NB_OUTPUTS should be a multiple of NB_CHANNELS.");
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax = (PADDING_Y == 0
&& OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
: clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0, DILATED_KERNEL_HEIGHT);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin = (PADDING_X == 0) ? 0
: max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax = (PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0, DILATED_KERNEL_WIDTH);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
const int oOffset = NB_OUTPUTS * oPos;
// <--
const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
Bias_T weightedSum = biases ? biases[output] : 0;
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
{
continue;
}
const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
const int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output*KERNEL_HEIGHT + sy)
* KERNEL_WIDTH;
if (DILATION_X == 1 && ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
inputs + iOffset + channel,
weights + wOffset,
weightedSum);
}
else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
{
continue;
}
const int iOffsetInRange = iOffset
+ sx * DILATION_X * NB_CHANNELS;
weightedSum += inputs[iOffsetInRange + channel]
* weights[wOffset + sx];
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
}
}
}
}
// Template specialization when biases are not given to the convolution
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
std::nullptr_t __restrict,
const Rescaling_T& __restrict rescaling)
{
const float* b = nullptr;
convolution_depthwise_forward<NB_CHANNELS,
CHANNELS_HEIGHT,
CHANNELS_WIDTH,
NB_OUTPUTS,
OUTPUTS_HEIGHT,
OUTPUTS_WIDTH,
PADDING_Y,
PADDING_X,
STRIDE_Y,
STRIDE_X,
DILATION_Y,
DILATION_X,
KERNEL_HEIGHT,
KERNEL_WIDTH,
ACTIVATION>
(inputs, outputs, weights, b, rescaling);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
......@@ -2,7 +2,7 @@
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "network/activation_utils.hpp"
// Generic function for two inputs
......
......@@ -2,20 +2,20 @@
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp"
#include "network/rescaling.hpp"
#include "network/rescaling_utils.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include "network/macs.hpp"
#include "network/activation_utils.hpp"
template<int NB_CHANNELS,
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
__attribute__((always_inline)) inline
void fullyconnected_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
......@@ -28,14 +28,17 @@ void fullyconnected_forward (
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
}
}
......@@ -45,10 +48,12 @@ void fullyconnected_forward (
}
/*
Here the kernel to use with inputs in NHWC and weights in NHWC
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy);
......@@ -58,8 +63,8 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
* (iy + CHANNELS_HEIGHT * och);
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset,
weights + wOffset,
inputs + iOffset,
weights + wOffset,
weightedSum);
}
......@@ -69,4 +74,4 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
\ No newline at end of file
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#ifndef __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#define __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
#include "network/typedefs.hpp"
#include "network/utils.hpp"
// Todo add border value and border type (Reflect, Constant, Wrap...) and add
// the two missing pad value (bottom and right)
template <unsigned int AXIS_DIM_SIZE,
unsigned int PREAXIS_STRIDE,
unsigned int AXIS_STRIDE,
unsigned int POSTAXIS_STRIDE,
unsigned int NB_ELTS,
typename Input_T,
typename Output_T>
// void HardmaxImpl_cpu_forward_kernel(std::int32_t axis_, const
// std::vector<DimSize_t>& dims, const void* input_, void* output_)
__attribute__((always_inline)) inline void
hardmax2d_forward(const Input_T *__restrict input,
Output_T *__restrict output) {
// fill output with 0
for (Output_T *i = output; i != output + NB_ELTS; ++i) {
*i = 0;
}
// For each index on all the axes before and after 'axis', we have a
// different max element to find
for (unsigned int i = 0, preAxisOffset = 0; i < PREAXIS_STRIDE;
++i, preAxisOffset += AXIS_DIM_SIZE * POSTAXIS_STRIDE) {
for (unsigned int j = 0; j < POSTAXIS_STRIDE; ++j) {
// Init the max with first element
unsigned int maxIdx = 0;
Input_T maxVal = input[preAxisOffset + j];
// Loop over the elements on 'axis'
// Since we start at 0th idx, we already initialize the values like
// the 1st iteration has been done
for (unsigned int k = 1,
postAxisOffset = preAxisOffset + POSTAXIS_STRIDE;
k < AXIS_DIM_SIZE;
++k, postAxisOffset += POSTAXIS_STRIDE) {
Input_T currVal = input[postAxisOffset + j];
// Update max elements
if (currVal > maxVal) {
maxIdx = k;
maxVal = currVal;
}
}
output[preAxisOffset + maxIdx * POSTAXIS_STRIDE + j] = 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_HARDMAX__
......@@ -11,7 +11,9 @@ void leakyrelu_forward (
Output_T* __restrict outputs,
const float negative_slope)
{
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < NB_DATA; ++i) {
if (inputs[i] >= 0) {
outputs[i] = inputs[i];
......