Skip to content
Snippets Groups Projects

Adding MatMul operator

Merged Matthew Newson requested to merge mnewson/aidge_export_cpp:MatMul into dev
All threads resolved!
Files
4
@@ -6,9 +6,8 @@
// Generic function for matmul and activation
template<int M,
int K,
int N,
template<int INPUT_A_DIMS[], int INPUT_B_DIMS[], int OUTPUT_DIMS[],
int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT,
ActivationFunction_T ACTIVATION,
typename Input_T, typename Output_T,
typename Rescaling_T>
@@ -19,15 +18,96 @@ void matmul_forward (
Output_T* __restrict outputs,
const Rescaling_T& __restrict rescaling)
{
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
Output_T sum = Output_T(0);
for (int k = 0; k < K; ++k) {
sum += inputs1[K*m + k] * inputs2[N*k + n];
// Initialize the arrays holding each input's dims, left-padded with 1s up to SIZE_DIM_OUT (i.e. broadcast where needed)
int ndim_a[SIZE_DIM_OUT];
int ndim_b[SIZE_DIM_OUT];
if ( _SIZE_DIM_IN_A == 1){
ndim_a[0] = 1;
ndim_a[1] =INPUT_A_DIMS[0];
}
if ( _SIZE_DIM_IN_B == 1){
ndim_b[0] =INPUT_B_DIMS[0];
ndim_b[1] = 1;
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_A;
ndim_a[i] = (i< idx) ? 1 :INPUT_A_DIMS[i-idx];
}
for (int i= 0; i<SIZE_DIM_OUT; i++){
int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_B;
ndim_b[i] = (i< idx) ? 1 :INPUT_B_DIMS[i-idx];
}
// Initialize the strides used to step through each input's stacked matrices when broadcasting applies
int stride_post0[SIZE_DIM_OUT-2] ;
int stride_post1[SIZE_DIM_OUT-2] ;
int stride_step0[SIZE_DIM_OUT-2] ;
int stride_step1[SIZE_DIM_OUT-2] ;
if (SIZE_DIM_OUT > 2){
stride_post0[SIZE_DIM_OUT - 3] = 1;
stride_post1[SIZE_DIM_OUT - 3] = 1;
for (int i = SIZE_DIM_OUT-4; i != -1; --i) {
stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
}
for (int i = 0; i < SIZE_DIM_OUT-2; ++i) {
stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// If _SIZE_DIM_IN_A == _SIZE_DIM_IN_B, then both are also equal to SIZE_DIM_OUT;
// otherwise the lower-rank input is broadcast to the output rank.
int nbMatrices = 1;
for(int i = SIZE_DIM_OUT -3; i>=0; --i){
nbMatrices *= OUTPUT_DIMS[i];
}
int dim = SIZE_DIM_OUT -3;
int offsetIn0 = 0;
int offsetIn1 = 0;
int offsetOut = 0;
const int n = ndim_a[SIZE_DIM_OUT - 2];
const int k = ndim_a[SIZE_DIM_OUT - 1];
const int m = ndim_b[SIZE_DIM_OUT - 1];
const int matrix0Size = n*k;
const int matrix1Size = k*m;
const int matrixOutSize = n*m;
for(int stack = 0; stack < nbMatrices;){
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
float sum = 0;
for (int l = 0; l < k; ++l) {
sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]);
}
outputs[offsetOut*matrixOutSize + i*m + j] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
}
}
if (++stack < nbMatrices) {
int tmp_stack = stack;
while(tmp_stack % OUTPUT_DIMS[dim] == 0) {
tmp_stack /= OUTPUT_DIMS[dim];
dim--;
}
outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = SIZE_DIM_OUT -3;
}
}
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
#endif // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
\ No newline at end of file
Loading