Commit a0fd8327 authored by Charles Villard

edit: Conv/FC: Adapt code generation to handle no bias in FC and Conv operations
parent c74265bd
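The key change in both kernels is that the bias pointer may now be null. Below is a minimal, self-contained sketch of that guard; it is my own illustration, not code from this commit, and the helper name init_accumulator is hypothetical.

#include <cstdint>
#include <cstdio>

// Stand-in for the accumulator initialisation used in the kernels below:
// with a null bias pointer the accumulator starts at zero, otherwise at the
// stored bias value.
template<typename Bias_T>
Bias_T init_accumulator(const Bias_T* biases, int channel) {
    return (biases) ? biases[channel] : Bias_T(0);
}

int main() {
    const int32_t biases[2] = {5, -3};
    std::printf("%d\n", (int)init_accumulator(biases, 1));           // -3: bias present
    std::printf("%d\n", (int)init_accumulator<int32_t>(nullptr, 1)); // 0: no bias
    return 0;
}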
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "kernels/macs.hpp"
#include "network/rescaling.hpp"
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
template<
int NB_CHANNELS,
int CHANNELS_HEIGHT,
int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT,
int OUTPUTS_WIDTH,
int PADDING_Y,
int PADDING_X,
int STRIDE_Y,
int STRIDE_X,
int DILATION_Y,
int DILATION_X,
int KERNEL_HEIGHT,
int KERNEL_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T,
typename Output_T,
typename Weight_T,
typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline void convolution_forward(
const Input_T *__restrict inputs,
Output_T *__restrict outputs,
const Weight_T *__restrict weights,
const Bias_T *__restrict biases,
const Rescaling_T &__restrict rescaling
) {
constexpr int DILATED_KERNEL_HEIGHT =
KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
constexpr int DILATED_KERNEL_WIDTH =
KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
constexpr int OUTPUTS_HEIGHT_NOPAD =
(CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y)
/ STRIDE_Y;
constexpr int OUTPUTS_WIDTH_NOPAD =
(CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X)
/ STRIDE_X;
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
const int syMin =
(PADDING_Y == 0) ? 0 : max(PADDING_Y - (oy * STRIDE_Y), 0);
const int syMax =
(PADDING_Y == 0 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD)
? DILATED_KERNEL_HEIGHT
: clamp(
CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
0,
DILATED_KERNEL_HEIGHT
);
const int iy = (oy * STRIDE_Y) - PADDING_Y;
#pragma omp parallel for collapse(2)
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
for (int output = 0; output < NB_OUTPUTS; ++output) {
// moved to inner loop for collapsing -->
const int sxMin =
(PADDING_X == 0) ? 0 : max(PADDING_X - (ox * STRIDE_X), 0);
const int sxMax =
(PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
? DILATED_KERNEL_WIDTH
: clamp(
CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
0,
DILATED_KERNEL_WIDTH
);
const int ix = (ox * STRIDE_X) - PADDING_X;
const int oPos = (ox + OUTPUTS_WIDTH * oy);
@@ -66,51 +82,58 @@ void convolution_forward(
// <--
Bias_T weightedSum = (biases) ? biases[output] : Bias_T(0);
for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
if ((PADDING_Y != 0
|| OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
&& ((sy * DILATION_Y < syMin)
|| (sy * DILATION_Y >= syMax)))
{
continue;
}
const int iPos =
ix + CHANNELS_WIDTH * (iy + sy * DILATION_Y);
int iOffset = NB_CHANNELS * iPos;
const int wOffset = (output * KERNEL_HEIGHT + sy)
* KERNEL_WIDTH * NB_CHANNELS;
if (DILATION_X == 1
&& ((PADDING_X == 0
&& OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
|| sxMax - sxMin == KERNEL_WIDTH))
{
macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
inputs + iOffset, weights + wOffset, weightedSum
);
} else {
for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
if ((PADDING_X != 0
|| OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
&& ((sx * DILATION_X < sxMin)
|| (sx * DILATION_X >= sxMax)))
{
continue;
}
int iOffsetInRange =
iOffset + sx * DILATION_X * NB_CHANNELS;
macsOnRange<NB_CHANNELS>(
// same input line so no wrapping can occur
inputs + iOffsetInRange,
weights + wOffset + sx * NB_CHANNELS,
weightedSum
);
}
}
}
outputs[oOffset + output] = activation_forward_value<Output_T>(
weightedSum, output, ACTIVATION, rescaling
);
}
}
}
......
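A side note on the constexpr arithmetic in convolution_forward: OUTPUTS_HEIGHT_NOPAD and OUTPUTS_WIDTH_NOPAD compute the valid (no padding) output size, floor((IN - DILATION*(KERNEL-1) - 1)/STRIDE) + 1, folded into a single integer division. A small check of that identity, my own sketch rather than part of the commit:

// nopad() mirrors the OUTPUTS_*_NOPAD expression from convolution_forward.
constexpr int nopad(int in, int kernel, int stride, int dilation) {
    return (in - dilation * (kernel - 1) - 1 + stride) / stride;
}
static_assert(nopad(32, 3, 1, 1) == 30, "32 px, 3x3 kernel, stride 1");
static_assert(nopad(32, 3, 2, 1) == 15, "32 px, 3x3 kernel, stride 2");
static_assert(nopad(32, 3, 1, 2) == 28, "32 px, 3x3 kernel, dilation 2");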
#ifndef __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "kernels/macs.hpp"
#include "network/rescaling.hpp"
#include "network/typedefs.hpp"
#include "network/utils.hpp"
#include "kernels/macs.hpp"
#include "kernels/activation.hpp"
template<
int NB_CHANNELS,
int CHANNELS_HEIGHT,
int CHANNELS_WIDTH,
int NB_OUTPUTS,
int OUTPUTS_HEIGHT,
int OUTPUTS_WIDTH,
ActivationFunction_T ACTIVATION,
typename Input_T,
typename Output_T,
typename Weight_T,
typename Bias_T,
typename Rescaling_T>
__attribute__((always_inline)) inline void fullyconnected_forward(
const Input_T *__restrict inputs,
Output_T *__restrict outputs,
const Weight_T *__restrict weights,
const Bias_T *__restrict biases,
const Rescaling_T &__restrict rescaling
) {
// Warning: there is a trick here!
// To use this kernel, the inputs have to be in NHWC while the weights are in NCHW.
// This is only an issue if the FC comes after a Flatten layer; for any other FC
// it does not matter, because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1.
// Solution: add a system to check the data format.
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
weightedSum += inputs[CHANNELS_WIDTH * NB_CHANNELS * iy + NB_CHANNELS * ix + ch]
* weights[CHANNELS_HEIGHT * CHANNELS_WIDTH * NB_CHANNELS * och
+ CHANNELS_HEIGHT * CHANNELS_WIDTH * ch + CHANNELS_HEIGHT * iy + ix];
}
}
}
outputs[och] = activation_forward_value<Output_T>(
weightedSum, och, ACTIVATION, rescaling
);
}
/*
Here the kernel to use with inputs in NHWC and weights in NHWC
#pragma omp parallel for
for (int och = 0; och < NB_OUTPUTS; och++) {
Bias_T weightedSum = biases[och];
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
const int iPos = (CHANNELS_WIDTH * iy);
int iOffset = NB_CHANNELS * iPos;
const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
* (iy + CHANNELS_HEIGHT * och);
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
inputs + iOffset,
weights + wOffset,
weightedSum);
}
outputs[och] = activation_forward_value<Output_T>(weightedSum, och,
ACTIVATION, rescaling);
}
*/
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
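For context on the data-layout warning in fullyconnected_forward, here is a hypothetical call site of the kind the generated graph code could contain. The buffer names, the Linear activation enumerator and the NoScaling rescaling type are assumptions made for this sketch, not taken from the commit. With every template argument spelled out, a plain nullptr can be passed for the bias pointer, which the new guard turns into a zero accumulator.

// Hypothetical FC layer: 64 input channels already flattened to 1x1 spatial
// size (so the NHWC/NCHW caveat above does not apply), 10 outputs, no bias.
static const int8_t fc_weights[10 * 64] = { /* exported weights */ };
static int8_t       features[64];
static int8_t       logits[10];

void run_fc() {
    fullyconnected_forward<64, 1, 1,        // NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH
                           10, 1, 1,        // NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH
                           Linear,          // ACTIVATION (assumed enumerator)
                           int8_t, int8_t,  // Input_T, Output_T
                           int8_t, int32_t, // Weight_T, Bias_T
                           NoScaling>(      // Rescaling_T (assumed type)
        features, logits, fc_weights, nullptr /* no bias */, NoScaling());
}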