
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (26)
Showing with 1219 additions and 162 deletions
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ADD__
#define __AIDGE_EXPORT_CPP_KERNELS_ADD__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void add_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ADD__
\ No newline at end of file
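To make the broadcasting behaviour concrete, here is a minimal, hypothetical usage sketch of add_forward. The shapes ([2, 3] + [3] -> [2, 3]), the values and the use of Linear as the ActivationFunction_T value are assumptions for illustration only; the dimension arrays live at namespace scope so they can be passed as non-type template arguments, just like the *_DIMS arrays emitted by the configuration templates. On a broadcast axis the precomputed stride_step is 1 - stride_post, which effectively rewinds the input offset so the same block is re-read instead of advancing.

// Hypothetical example, not part of the export: A is [2, 3], B is [3], output is [2, 3].
int ADD_EX_A_DIMS[] = {2, 3};
int ADD_EX_B_DIMS[] = {3};
int ADD_EX_O_DIMS[] = {2, 3};

void add_example() {
    const float a[6] = {1, 2, 3, 4, 5, 6};
    const float b[3] = {10, 20, 30};
    float out[6];
    // <NB_ELTS, A_DIMS, B_DIMS, O_DIMS, SIZE_DIM_IN_A, SIZE_DIM_IN_B, SIZE_DIM_OUT, OUT_SIZE, ACTIVATION, Input_T, Output_T>
    add_forward<6, ADD_EX_A_DIMS, ADD_EX_B_DIMS, ADD_EX_O_DIMS, 2, 1, 2, 6, Linear, float, float>(out, a, b);
    // out == {11, 22, 33, 14, 25, 36}: b is broadcast over the first axis of a.
}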
......@@ -3,6 +3,7 @@
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "kernels/activation.hpp"
#include <math.h>
// WARNING: this kernel only works for 32-bit floating point values
......@@ -12,30 +13,40 @@ template<int NB_OUTPUTS,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Param_T>
__attribute__((always_inline)) inline
void batchnorm_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Param_T* __restrict biases,
const Param_T* __restrict means,
const Param_T* __restrict scales,
const Param_T* __restrict variances,
const double epsilon)
{
int featureMapSize = OUTPUTS_HEIGHT * OUTPUTS_WIDTH;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int ch = 0; ch < NB_OUTPUTS; ++ch) {
int ioIndex = ch * featureMapSize;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = ioIndex; i < ioIndex + featureMapSize; i++) {
outputs[i] = biases[ch];
}
float var = sqrt(variances[ch] + epsilon);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int feature = 0; feature < featureMapSize; ++feature) {
outputs[ioIndex + feature] += (scales[ch] * (inputs[ioIndex + feature] - means[ch]) / var);
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
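For reference, the loop above implements the usual inference-time batch normalisation, output = bias + scale * (input - mean) / sqrt(variance + epsilon), channel by channel. A standalone scalar check of that formula, with made-up values:

// Scalar sanity check of the batchnorm_forward formula (all values are made up).
#include <cmath>
#include <cstdio>

int main() {
    const float x = 2.0f, mean = 1.0f, variance = 4.0f, scale = 0.5f, bias = 0.1f;
    const double epsilon = 1.0e-5;
    const float y = bias + scale * (x - mean) / std::sqrt(variance + epsilon);
    std::printf("%f\n", y);   // prints ~0.35
    return 0;
}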
......@@ -6,30 +6,44 @@
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
#include <omp.h>
#include <iostream>
// Weights index in NHWC
constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
return n * (H * W * C) +
h * (W * C) +
w * C +
c;
}
// Image index in CHW
constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
return c * (H * W) +
h * W +
w;
}
template<int NB_CHANNELS,
int IN_HEIGHT, int IN_WIDTH,
int NB_OUTPUTS, int GROUPS,
int OUT_HEIGHT, int OUT_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
constexpr int DILATED_KERNEL_HEIGHT
= KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH
= KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
......
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "kernels/rescaling.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
// Weights index in NHWC
constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
return n * (H * W * C) +
h * (W * C) +
w * C +
c;
}
// Image index in CHW
constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
return c * (H * W) +
h * W +
w;
}
template<int NB_CHANNELS,
int IN_HEIGHT, int IN_WIDTH,
int NB_OUTPUTS, int GROUPS,
int OUT_HEIGHT, int OUT_WIDTH,
int PADDING_Y, int PADDING_X,
int STRIDE_Y, int STRIDE_X,
int DILATION_Y, int DILATION_X,
int KERNEL_HEIGHT, int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Weight_T, typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs,
const Weight_T* __restrict weights,
const Bias_T* __restrict biases,
const Rescaling_T& __restrict rescaling)
{
if (NB_CHANNELS % GROUPS != 0 || NB_OUTPUTS % GROUPS != 0) {
throw std::invalid_argument("Groups must be a divisor of both NB_CHANNELS and NB_OUTPUTS!");
}
int c_in_g = NB_CHANNELS / GROUPS;
int c_out_g = NB_OUTPUTS / GROUPS;
#ifdef _OPENMP
#pragma omp parallel for collapse(3)
#endif
for (int oc = 0; oc < NB_OUTPUTS; oc++) {
for (int i = 0; i < OUT_HEIGHT; ++i) {
for (int j = 0; j < OUT_WIDTH; ++j) {
int g_oc = oc / c_out_g;
Output_T value = biases[oc];
for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int m = 0; m < KERNEL_HEIGHT; ++m) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n = 0; n < KERNEL_WIDTH; ++n) {
int i_p = i * STRIDE_X - PADDING_X + m * DILATION_X;
int j_p = j * STRIDE_Y - PADDING_Y + n * DILATION_Y;
if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
}
}
}
}
outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
\ No newline at end of file
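The GROUPS parameter partitions the channels: each group owns NB_CHANNELS / GROUPS input channels and NB_OUTPUTS / GROUPS output channels, and an output channel only accumulates over the input channels of its own group (GROUPS == 1 is a regular convolution, GROUPS == NB_CHANNELS == NB_OUTPUTS is depthwise). A small standalone sketch of that partitioning, with made-up channel counts:

// Illustration of the group/channel partitioning used by convolution_forward (sizes made up).
#include <cstdio>

int main() {
    const int NB_CHANNELS = 4, NB_OUTPUTS = 6, GROUPS = 2;
    const int c_in_g = NB_CHANNELS / GROUPS;   // 2 input channels per group
    const int c_out_g = NB_OUTPUTS / GROUPS;   // 3 output channels per group
    for (int oc = 0; oc < NB_OUTPUTS; ++oc) {
        const int g_oc = oc / c_out_g;
        std::printf("output channel %d reads input channels [%d, %d)\n",
                    oc, g_oc * c_in_g, (g_oc + 1) * c_in_g);
    }
    return 0;
}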
#ifndef __AIDGE_EXPORT_CPP_KERNELS_DIV__
#define __AIDGE_EXPORT_CPP_KERNELS_DIV__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void div_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_DIV__
\ No newline at end of file
......@@ -4,13 +4,12 @@
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
// Generic function for two inputs
template<int NB_ELTS, ElemWise_T ELEM_OP,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T, typename Rescaling_T>
__attribute__((always_inline)) inline
void elemwise_forward (
Output_T* __restrict outputs,
......@@ -21,32 +20,313 @@ void elemwise_forward (
if (std::is_floating_point<Input_T>::value)
{
Input_T val = 0;
switch (ELEM_OP) {
case Add: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] + inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
break;
}
case Sub: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
case Mul: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
case Div: {
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousIdx = SIZE_DIM_OUT-1;
for (int i = contiguousIdx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousIdx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousIdx; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousIdx] ;
int stride_post1[contiguousIdx] ;
int stride_step0[contiguousIdx] ;
int stride_step1[contiguousIdx] ;
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (int i = contiguousIdx-2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousIdx; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousIdx; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousIdx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] / inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i];
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
......@@ -58,7 +338,6 @@ void elemwise_forward (
else
{
int32_t val = 0;
switch (ELEM_OP) {
case Add: {
for (int i = 0; i < NB_ELTS; ++i) {
......@@ -81,6 +360,13 @@ void elemwise_forward (
}
break;
}
case Div: {
for (int i = 0; i < NB_ELTS; ++i) {
val = inputs1[i] / inputs2[i] ;
outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
}
break;
}
default: {
// Copy inputs1 in outputs for default case
for (int i = 0; i < NB_ELTS; ++i) {
......@@ -94,78 +380,6 @@ void elemwise_forward (
}
// Generic function for multiple inputs
// Not working
// template<ElemWise_T ELEM_OP, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int /*pos*/, int /*ch*/)
// {
// return 0;
// }
// template<ElemWise_T ELEM_OP,
// int NB_CHANNELS,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T>
// __attribute__((always_inline)) inline
// Output_T elemWise (int pos, int ch,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// int iOffset = NB_CHANNELS * pos;
// return firstInputs[iOffset + ch]
// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
// }
// template<// For all inputs
// int NB_CHANNELS,
// int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
// int NB_ELTS,
// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
// ElemWise_T ELEM_OP,
// ActivationFunction_T ACTIVATION,
// // For next inputs
// int... ARGS,
// typename... INPUTS,
// // Types
// typename Input_T, typename Output_T,
// typename Rescaling_T>
// __attribute__((always_inline)) inline
// void elemWise_forward (
// Output_T* __restrict outputs,
// const Rescaling_T& __restrict rescaling,
// const Input_T* __restrict firstInputs,
// INPUTS... inputs)
// {
// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
// const int pos = (ox + OUTPUTS_WIDTH * oy);
// int oOffset = NB_ELTS * pos;
// for (int ch = 0; ch < NB_ELTS; ++ch) {
// const Add_T val = elemWise<ELEM_OP,
// INPUT_NB_CHANNELS,
// INPUT_MEM_CONT_OFFSET,
// INPUT_MEM_CONT_NB_ELTS,
// INPUT_MEM_WRAP_OFFSET,
// INPUT_MEM_WRAP_NB_ELTS,
// INPUT_MEM_STRIDE,
// ARGS...>(pos, ch, firstInputs, inputs...);
// outputs[oOffset + ch]
// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
// }
// }
// }
// }
#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
\ No newline at end of file
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ERF__
#define __AIDGE_EXPORT_CPP_KERNELS_ERF__
#include "network/typedefs.hpp"
#include <cmath>
#include <math.h>
template<int _NB_ELTS,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void erf_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
double a1 = 0.254829592;
double a2 = -0.284496736;
double a3 = 1.421413741;
double a4 = -1.453152027;
double a5 = 1.061405429;
double p = 0.3275911;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < _NB_ELTS; ++i) {
int sign = 1;
if (inputs[i] < 0)
sign = -1;
double abs_value = std::fabs(inputs[i]);
// A&S formula 7.1.26
double t = 1.0/(1.0 + p*abs_value);
double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-abs_value*abs_value);
outputs[i] = sign*y;
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_ERF_
\ No newline at end of file
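The polynomial is Abramowitz & Stegun formula 7.1.26, whose absolute error is on the order of 1e-7. A quick standalone comparison against std::erf (the test points are arbitrary):

// Accuracy check of the A&S 7.1.26 approximation used by erf_forward.
#include <cmath>
#include <cstdio>

int main() {
    const double p = 0.3275911, a1 = 0.254829592, a2 = -0.284496736,
                 a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
    const double xs[] = {-2.0, -0.5, 0.0, 0.5, 2.0};
    for (double x : xs) {
        const double ax = std::fabs(x);
        const double t = 1.0 / (1.0 + p * ax);
        const double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1) * t * std::exp(-ax*ax);
        const double approx = (x < 0.0 ? -y : y);
        std::printf("x=% .1f  approx=% .8f  std::erf=% .8f\n", x, approx, std::erf(x));
    }
    return 0;
}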
......@@ -28,7 +28,7 @@ void fullyconnected_forward (
// It is only an issue if the FC was after a flatten layer.
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
// Solution: Add a system to check dataformat
/*for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
......@@ -42,9 +42,9 @@ void fullyconnected_forward (
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}*/
//Here the kernel to use with inputs in NHWC and weights in NHWC
#pragma omp parallel for
for (int och = 0; och < NB_OUTPUTS; och++) {
......@@ -65,7 +65,7 @@ Here the kernel to use with inputs in NHWC and weights in NHWC
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
}
}
......
......@@ -6,28 +6,106 @@
// Generic function for matmul and activation
template<int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void matmul_forward (
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2,
Output_T* __restrict outputs)
{
// initialize arrays storing the broadcast (or original) dims
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
// A 1-D input follows the matmul convention: A (k) -> (1, k), B (k) -> (k, 1);
// otherwise missing leading dims are broadcast to 1.
if (_SIZE_DIM_IN_A == 1) {
ndim_a[0] = 1;
ndim_a[1] = INPUT_A_DIMS[0];
} else {
for (int i = 0; i < SIZE_DIM_OUT; i++) {
int idx = SIZE_DIM_OUT - _SIZE_DIM_IN_A;
ndim_a[i] = (i < idx) ? 1 : INPUT_A_DIMS[i-idx];
}
}
if (_SIZE_DIM_IN_B == 1) {
ndim_b[0] = INPUT_B_DIMS[0];
ndim_b[1] = 1;
} else {
for (int i = 0; i < SIZE_DIM_OUT; i++) {
int idx = SIZE_DIM_OUT - _SIZE_DIM_IN_B;
ndim_b[i] = (i < idx) ? 1 : INPUT_B_DIMS[i-idx];
}
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[SIZE_DIM_OUT-2] ;
int stride_post1[SIZE_DIM_OUT-2] ;
int stride_step0[SIZE_DIM_OUT-2] ;
int stride_step1[SIZE_DIM_OUT-2] ;
if (SIZE_DIM_OUT > 2){
stride_post0[SIZE_DIM_OUT - 3] = 1;
stride_post1[SIZE_DIM_OUT - 3] = 1;
for (int i = SIZE_DIM_OUT-4; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < SIZE_DIM_OUT-2; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// if _SIZE_DIM_IN_B == _SIZE_DIM_IN_A, then _SIZE_DIM_IN_A == SIZE_DIM_OUT == _SIZE_DIM_IN_B;
// else it will be broadcasted to the correct dims
int nbMatrices = 1;
for(int i = SIZE_DIM_OUT -3; i>=0; --i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = SIZE_DIM_OUT -3;
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
const int n = ndim_a[SIZE_DIM_OUT - 2];
const int k = ndim_a[SIZE_DIM_OUT - 1];
const int m = ndim_b[SIZE_DIM_OUT - 1];
const int matrix0Size = n*k;
const int matrix1Size = k*m;
const int matrixOutSize = n*m;
for(int stack = 0; stack < nbMatrices;){
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
float sum = 0;
for (int l = 0; l < k; ++l) {
sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]);
}
outputs[ offsetOut*matrixOutSize + i*m + j] = sum;
}
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = SIZE_DIM_OUT -3;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
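A minimal, hypothetical instantiation of the new matmul_forward for the plain 2-D case (shapes and values made up; Linear is assumed to be the identity ActivationFunction_T value and is unused by the kernel body). With SIZE_DIM_OUT == 2 there is a single stack, so the kernel reduces to one row-major matrix product:

// Hypothetical 2x3 @ 3x2 product, for illustration only.
int MM_EX_A_DIMS[] = {2, 3};
int MM_EX_B_DIMS[] = {3, 2};
int MM_EX_O_DIMS[] = {2, 2};

void matmul_example() {
    const float a[6] = {1, 2, 3, 4, 5, 6};   // [[1,2,3],[4,5,6]]
    const float b[6] = {1, 0, 0, 1, 1, 1};   // [[1,0],[0,1],[1,1]]
    float out[4];
    // <A_DIMS, B_DIMS, O_DIMS, _SIZE_DIM_IN_A, _SIZE_DIM_IN_B, SIZE_DIM_OUT, ACTIVATION, Input_T, Output_T>
    matmul_forward<MM_EX_A_DIMS, MM_EX_B_DIMS, MM_EX_O_DIMS, 2, 2, 2, Linear, float, float>(a, b, out);
    // out == {4, 5, 10, 11}, i.e. [[4,5],[10,11]]
}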
#ifndef __AIDGE_EXPORT_CPP_KERNELS_MUL__
#define __AIDGE_EXPORT_CPP_KERNELS_MUL__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void mul_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] * inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MUL__
\ No newline at end of file
......@@ -7,6 +7,35 @@
#include <stdexcept>
void reorder_NCHW_NHWC_pool(const float* input, float* output, int N, int C, int H, int W, bool direct = true) {
auto nchw_index = [=](int n, int c, int h, int w) {
return ((n * C + c) * H + h) * W + w;
};
auto nhwc_index = [=](int n, int h, int w, int c) {
return ((n * H + h) * W + w) * C + c;
};
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
#pragma omp parallel for
for (int c = 0; c < C; ++c) {
#pragma omp parallel for
for (int h = 0; h < H; ++h) {
#pragma omp parallel for
for (int w = 0; w < W; ++w) {
if (direct) {
output[nhwc_index(n, h, w, c)] = input[nchw_index(n, c, h, w)];
} else {
output[nchw_index(n, c, h, w)] = input[nhwc_index(n, h, w, c)];
}
}
}
}
}
}
template<int NB_CHANNELS,
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
int NB_OUTPUTS,
......@@ -22,11 +51,16 @@ void pooling_forward(
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
float inputs_ordered[NB_CHANNELS * CHANNELS_HEIGHT * CHANNELS_WIDTH];
float outputs_unordered[NB_OUTPUTS * OUTPUTS_HEIGHT * OUTPUTS_WIDTH];
reorder_NCHW_NHWC_pool(inputs, inputs_ordered, 1, NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH, true);
constexpr int OUTPUTS_HEIGHT_NOPAD
= (CHANNELS_HEIGHT - POOL_HEIGHT + STRIDE_Y) / STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD
= (CHANNELS_WIDTH - POOL_WIDTH + STRIDE_X) / STRIDE_X;
#pragma omp parallel for
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin = (PADDING_Y == 0) ? 0
: max(PADDING_Y - (oy * STRIDE_Y), 0);
......@@ -78,15 +112,15 @@ void pooling_forward(
int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
if (inputs_ordered[iOffsetInRange] > maxVal)
maxVal = inputs_ordered[iOffsetInRange];
}
}
outputs_unordered[oOffset + output] = maxVal;
}
else if (POOLING_TYPE == Average) {
Input_T sum = 0;
for (int sy = 0; sy < POOL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
......@@ -109,11 +143,12 @@ void pooling_forward(
}
int iOffsetInRange = iOffset + output + sx * NB_CHANNELS;
sum = inputs_ordered[iOffsetInRange] + sum;
}
}
outputs_unordered[oOffset + output] = (Output_T) (sum / (POOL_HEIGHT * POOL_WIDTH));
}
else {
throw std::runtime_error("The export only supports Max and Average pooling.");
......@@ -121,6 +156,7 @@ void pooling_forward(
}
}
}
reorder_NCHW_NHWC_pool(outputs_unordered, outputs, 1, NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH, false);
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_POOLING__
\ No newline at end of file
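The reorder helper converts the NCHW input into the NHWC indexing this pooling kernel expects and converts the result back; note that inputs_ordered and outputs_unordered are per-call stack buffers, so the layer temporarily holds a second copy of its activations. A standalone sketch of the index mapping it relies on (sizes made up):

// Index mapping used by reorder_NCHW_NHWC_pool, on made-up sizes (N=1, C=2, H=2, W=3).
#include <cstdio>

int main() {
    const int C = 2, H = 2, W = 3;
    auto nchw_index = [=](int n, int c, int h, int w) { return ((n * C + c) * H + h) * W + w; };
    auto nhwc_index = [=](int n, int h, int w, int c) { return ((n * H + h) * W + w) * C + c; };
    // The element (c=1, h=0, w=2) sits at offset 8 in NCHW and offset 5 in NHWC.
    std::printf("NCHW offset %d  <->  NHWC offset %d\n",
                nchw_index(0, 1, 0, 2), nhwc_index(0, 0, 2, 1));
    return 0;
}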
#ifndef __AIDGE_EXPORT_CPP_KERNELS_SUB__
#define __AIDGE_EXPORT_CPP_KERNELS_SUB__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
template<int NB_ELTS,
int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int SIZE_DIM_IN_A, int SIZE_DIM_IN_B, int SIZE_DIM_OUT, int OUT_SIZE,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void sub_forward (
Output_T* __restrict outputs,
const Input_T* __restrict inputs1,
const Input_T* __restrict inputs2)
{
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 : INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 : INPUT_B_DIMS[i-idx];
}
// Find the highest equal dimension
int contiguousidx = SIZE_DIM_OUT - 1;
for (int i = contiguousidx; i >= 0 && ndim_a[i] == ndim_b[i]; i--) {
contiguousidx = i;
}
// Compute the highest number of contiguous data for each Tensor
int input0_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input0_contiguous_size *= ndim_a[i];
}
int input1_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
input1_contiguous_size *= ndim_b[i];
}
int output_contiguous_size = 1;
for(int i = contiguousidx ; i<SIZE_DIM_OUT; ++i){
output_contiguous_size *= OUTPUT_DIMS[i];
}
// initialize strides to iterate through data because of broadcasting
int stride_post0[contiguousidx ] ;
int stride_post1[contiguousidx ] ;
int stride_step0[contiguousidx ] ;
int stride_step1[contiguousidx ] ;
if (contiguousidx > 0) {
stride_post0[contiguousidx - 1] = 1;
stride_post1[contiguousidx - 1] = 1;
for (int i = contiguousidx -2; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < contiguousidx ; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
int nbMatrices = 1;
for(int i = 0; i<contiguousidx ; ++i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = contiguousidx - 1;
for(int stack = 0; stack < nbMatrices;){
for(int i = 0; i < output_contiguous_size; ++i){
int in0_id = (input0_contiguous_size != 1) ? i : 0;
int in1_id = (input1_contiguous_size != 1) ? i : 0;
outputs[i + offsetOut*output_contiguous_size] = inputs1[in0_id + offsetIn0*input0_contiguous_size] - inputs2[in1_id + offsetIn1*input1_contiguous_size];
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousidx - 1;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_SUB__
\ No newline at end of file
#ifndef __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
#define __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
#include "network/typedefs.hpp"
using namespace std;
template< int INPUT_DIMS[], int PERM[], int OUTPUT_DIMS[],
int SIZE_OUTPUT_DIMS, int SIZE,
typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void transpose_forward (
const Input_T* __restrict inputs,
Output_T* __restrict outputs)
{
int newStrides[SIZE_OUTPUT_DIMS];
for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){newStrides[i] = 1;}
for (int i = 0; i < SIZE_OUTPUT_DIMS; ++i) {
for (int j = i + 1; j < SIZE_OUTPUT_DIMS; ++j) {
newStrides[i] *= OUTPUT_DIMS[j];
}
}
int indices[SIZE_OUTPUT_DIMS];
for (int i = 0; i<SIZE_OUTPUT_DIMS;++i){indices[i] = 0;}
for (int i = 0; i < SIZE; ++i) {
int idx = 0;
for (int j = SIZE_OUTPUT_DIMS -1; j >=0; --j) {
idx += indices[PERM[j]] * newStrides[j];
}
outputs[idx] = inputs[i];
for (int j = SIZE_OUTPUT_DIMS - 1; j >= 0; --j) {
if (indices[j] < INPUT_DIMS[j] - 1) {
indices[j]++;
break;
}
else {
indices[j] = 0;
}
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_TRANSPOSE__
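A small, hypothetical instantiation of transpose_forward, permuting a [2, 3] tensor into [3, 2] (values made up):

// Hypothetical 2x3 -> 3x2 transpose, for illustration only.
int TR_EX_IN_DIMS[]  = {2, 3};
int TR_EX_PERM[]     = {1, 0};
int TR_EX_OUT_DIMS[] = {3, 2};

void transpose_example() {
    const float in[6] = {1, 2, 3, 4, 5, 6};   // [[1,2,3],[4,5,6]]
    float out[6];
    // <INPUT_DIMS, PERM, OUTPUT_DIMS, SIZE_OUTPUT_DIMS, SIZE, Input_T, Output_T>
    transpose_forward<TR_EX_IN_DIMS, TR_EX_PERM, TR_EX_OUT_DIMS, 2, 6, float, float>(in, out);
    // out == {1, 4, 2, 5, 3, 6}, i.e. [[1,4],[2,5],[3,6]]
}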
......@@ -56,7 +56,7 @@ class ProducerCPP(ExportNode):
super().__init__(node, mem_info)
self.values = np.array(self.operator.get_output(0))
if len(self.values.shape) == 4:  # Note: export in HWC
self.values = np.transpose(self.values, (0, 2, 3, 1))
def export(self, export_folder: Path):
......@@ -143,6 +143,24 @@ def _setup_conv2D(conv):
str(ROOT / "kernels" / "rescaling.hpp")
]
def _setup_elemwise_op(elemwise, op):
"""Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
elemwise.attributes["elemwise_op"] = op
elemwise.attributes["activation"] = "Linear"
elemwise.attributes["rescaling"] = "NoScaling"
elemwise.config_template = str(
ROOT / "templates" / "configuration" / "elemwise_config.jinja")
elemwise.forward_template = str(
ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
elemwise.include_list = []
elemwise.kernels_to_copy = [
str(ROOT / "kernels" / "elemwise.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
@ExportLibCpp.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class ConvCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
......@@ -150,7 +168,7 @@ class ConvCPP(ExportNodeCpp):
# No padding with Conv
# Use PaddedConv to add padding attribute
self.attributes["padding"] = [0, 0]
self.attributes["groups"] = 1
_setup_conv2D(self)
@ExportLibCpp.register_metaop("PaddedConv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
......@@ -169,25 +187,28 @@ class PaddedConvCPP(ExportNodeCpp):
).attr.stride_dims
self.attributes["dilation_dims"] = n.get_operator(
).attr.dilation_dims
self.attributes["groups"] = 1
_setup_conv2D(self)
@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class PaddedConvDepthWiseCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
# TODO find a way to retrieve attr for meta op
for n in self.operator.get_micro_graph().get_nodes():
if n.type() == "Pad2D":
self.attributes["padding"] = n.get_operator(
).attr.begin_end_borders
if n.type() == "ConvDepthWise2D":
self.attributes["kernel_dims"] = n.get_operator(
).attr.kernel_dims
self.attributes["stride_dims"] = n.get_operator(
).attr.stride_dims
self.attributes["dilation_dims"] = n.get_operator(
).attr.dilation_dims
elemwise.attributes["elemwise_op"] = op
elemwise.attributes["activation"] = "Linear"
elemwise.attributes["rescaling"] = "NoScaling"
elemwise.config_template = str(
ROOT / "templates" / "configuration" / "elemwise_config.jinja")
elemwise.forward_template = str(
ROOT / "templates" / "kernel_forward" / "elemwise_forward.jinja")
elemwise.include_list = []
elemwise.kernels_to_copy = [
str(ROOT / "kernels" / "elemwise.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
self.attributes["groups"] = self.attributes["out_chan"][0]
_setup_conv2D(self)
@ExportLibCpp.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class AddCPP(ExportNodeCpp):
......@@ -210,6 +231,14 @@ class MulCPP(ExportNodeCpp):
_setup_elemwise_op(self, "Mul")
@ExportLibCpp.register("Div", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class DivCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
_setup_elemwise_op(self, "Div")
def _setup_pooling(pooling):
"""Common code (template and kernel setup) shared across all the different pooling operator."""
......@@ -302,4 +331,39 @@ class TransposeCPP(ExportNodeCpp):
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "transpose.hpp")
]
@ExportLibCpp.register("Erf", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class ErfCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
self.attributes["activation"] = "Linear"
self.attributes["rescaling"] = "NoScaling"
self.config_template = str(
ROOT / "templates" / "configuration" / "erf_config.jinja")
self.forward_template = str(
ROOT / "templates" / "kernel_forward" / "erf_forward.jinja")
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "erf.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
@ExportLibCpp.register("BatchNorm2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class BatchNorm2DCPP(ExportNodeCpp):
def __init__(self, node, mem_info):
super().__init__(node, mem_info)
self.attributes["activation"] = "Linear"
self.attributes["rescaling"] = "NoScaling"
self.config_template = str(
ROOT / "templates" / "configuration" / "batchnorm_config.jinja")
self.forward_template = str(
ROOT / "templates" / "kernel_forward" / "batchnorm_forward.jinja")
self.include_list = []
self.kernels_to_copy = [
str(ROOT / "kernels" / "batchnorm.hpp"),
str(ROOT / "kernels" / "activation.hpp"),
str(ROOT / "kernels" / "rescaling.hpp")
]
\ No newline at end of file
......@@ -19,7 +19,8 @@ typedef enum {
typedef enum {
Add,
Sub,
Mul,
Div
} ElemWise_T;
typedef enum {
......
{#- For name header -#}
#ifndef {{ name|upper }}_LAYER_H
#define {{ name|upper }}_LAYER_H
#include "kernels/rescaling.hpp"
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#endif /* {{ name|upper }}_LAYER_H */
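For reference, this template would render to something like the following header for a hypothetical node named "add0" with input A of shape [2, 3], input B of shape [3] and output [2, 3] (the _def_io.jinja and _meminfo.jinja parts are omitted; Linear and NoScaling are the defaults set by the Python export nodes):

#ifndef ADD0_LAYER_H
#define ADD0_LAYER_H
#include "kernels/rescaling.hpp"

#define ADD0_NB_ELTS 2*3
#define ADD0_NB_ELTS_B 3
int ADD0_OUTPUT_DIMS[] = { 2, 3 };
int ADD0_INPUT_A_DIMS[] = { 2, 3 };
int ADD0_INPUT_B_DIMS[] = { 3 };
#define ADD0_SIZE_DIM_IN_A 2
#define ADD0_SIZE_DIM_IN_B 1
#define ADD0_SIZE_DIM_OUT 2
#define ADD0_OUT_SIZE 6
#define ADD0_ACTIVATION Linear
static const NoScaling ADD0_RESCALING = {};
#endif /* ADD0_LAYER_H */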
......@@ -8,4 +8,4 @@
#define {{ name|upper }}_ACTIVATION {{ activation }}
#define {{ name|upper }}_EPSILON {{ epsilon }}
#endif /* {{ name|upper }}_LAYER_H */
\ No newline at end of file
......@@ -5,6 +5,8 @@
{# For layer configuration -#}
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
#define {{ name|upper }}_GROUPS {{ groups }}
#define {{ name|upper }}_PADDING_Y {{ padding[1] }}
#define {{ name|upper }}_PADDING_X {{ padding[0] }}
#define {{ name|upper }}_STRIDE_Y {{ stride_dims[1] }}
......@@ -21,5 +23,4 @@ static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
#define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
#endif /* {{ name|upper }}_LAYER_H */
{#- For name header -#}
#ifndef {{ name|upper }}_LAYER_H
#define {{ name|upper }}_LAYER_H
#include "kernels/rescaling.hpp"
{% include "./_def_io.jinja" %}
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
#endif /* {{ name|upper }}_LAYER_H */
......@@ -7,6 +7,19 @@
{% include "./_meminfo.jinja" %}
{# For layer configuration -#}
#define {{ name|upper }}_NB_ELTS {{ in_dims[0]|join('*') }}
#define {{ name|upper }}_NB_ELTS_B {{ in_dims[1]|join('*')}}
int {{name|upper}}_OUTPUT_DIMS[] = { {{ out_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_A_DIMS[] = { {{ in_dims[0]|join(", ") }} };
int {{name|upper}}_INPUT_B_DIMS[] = { {{ in_dims[1]|join(", ") }} };
#define {{name|upper}}_SIZE_DIM_IN_A {{in_dims[0]|length}}
#define {{name|upper}}_SIZE_DIM_IN_B {{in_dims[1]|length}}
#define {{name|upper}}_SIZE_DIM_OUT {{out_dims[0]|length}}
#define {{ name|upper }}_OUT_SIZE {{out_size[0]}}
#define {{ name|upper }}_ACTIVATION {{ activation }}
#define {{ name|upper }}_ELEM_OP {{ elemwise_op }}
static const {{ rescaling }} {{ name|upper }}_RESCALING = {};
......