Matthew Newson · Matthew Newson · 8d6911af · 6efdddc2 · 9ffbb22f · 8b9a9a1d
--- a/aidge_export_cpp/kernels/matmul.hpp

+ 90

− 10
+++ b/aidge_export_cpp/kernels/matmul.hpp

+ 90

− 10
 @@ -6,9 +6,8 @@

 // Generic function for matmul and activation

-template<int M,
-         int K,
-         int N,
+template<int INPUT_A_DIMS[],  int INPUT_B_DIMS[], int OUTPUT_DIMS[], // dimension arrays for input A, input B and the output
+		int _SIZE_DIM_IN_A, int _SIZE_DIM_IN_B, int SIZE_DIM_OUT, // dim counts for A, B and the output (used below to left-pad the input shapes)
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Rescaling_T>
 @@ -19,15 +18,96 @@ void matmul_forward (
    Output_T* __restrict outputs,
    const Rescaling_T& __restrict rescaling)
 {
-    for (int m = 0; m < M; ++m) {
-        for (int n = 0; n < N; ++n) {
-            Output_T sum = Output_T(0);
-            for (int k = 0; k < K; ++k) {
-                sum += inputs1[K*m + k] * inputs2[N*k + n];
+    // For each output matrix: multiply an (n x k) matrix of inputs1 by a (k x m) matrix of inputs2, apply the activation, store into outputs. inputs1/inputs2 are declared in signature lines not shown in this hunk.
+    // Input shapes left-padded with 1s up to SIZE_DIM_OUT entries, so both inputs can be walked against the output's dims
+    int ndim_a[SIZE_DIM_OUT];     // padded shape of input A
+    int ndim_b[SIZE_DIM_OUT]; // padded shape of input B
+    if ( _SIZE_DIM_IN_A == 1){ // 1-D A: written as [1, len]
+        ndim_a[0] = 1;
+        ndim_a[1] =INPUT_A_DIMS[0];
+    }
+    if ( _SIZE_DIM_IN_B == 1){ // 1-D B: written as [len, 1]
+        ndim_b[0] =INPUT_B_DIMS[0];
+        ndim_b[1] = 1;
+    }
+    // NOTE(review): the two loops below assign every entry of ndim_a/ndim_b, so the 1-D values written above are overwritten; confirm this is intended
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_A; // number of leading 1s prepended to A's shape
+        ndim_a[i] = (i< idx) ? 1 :INPUT_A_DIMS[i-idx];
+    }
+    for (int i= 0; i<SIZE_DIM_OUT; i++){
+        int idx = SIZE_DIM_OUT-_SIZE_DIM_IN_B; // number of leading 1s prepended to B's shape
+        ndim_b[i] = (i< idx) ? 1 :INPUT_B_DIMS[i-idx];
+    }
+        
+    // Strides over the batch dims (all dims except the last two), counted in matrices, used to step through the inputs when broadcasting
+    int stride_post0[SIZE_DIM_OUT-2] ; // A: product of the padded batch dims located after index i
+    int stride_post1[SIZE_DIM_OUT-2] ; // B: product of the padded batch dims located after index i
+    int stride_step0[SIZE_DIM_OUT-2] ; // A: matrix-offset increment applied when batch dim i advances
+    int stride_step1[SIZE_DIM_OUT-2] ; // B: matrix-offset increment applied when batch dim i advances
+    if (SIZE_DIM_OUT > 2){ // strides are only filled when at least one batch dim exists
+        stride_post0[SIZE_DIM_OUT - 3] = 1;
+        stride_post1[SIZE_DIM_OUT - 3] = 1;
+        for (int i = SIZE_DIM_OUT-4; i != -1; --i) {
+            stride_post0[i] = stride_post0[i+1]*ndim_a[i+1];
+            stride_post1[i] = stride_post1[i+1]*ndim_b[i+1];
+        }
+        for (int i = 0; i < SIZE_DIM_OUT-2; ++i) {
+            stride_step0[i] = (ndim_a[i] == 1) ? 1 - stride_post0[i] : 1; // size-1 dim: step is 1 - stride_post (zero or negative) so the same matrices are reused
+            stride_step1[i] = (ndim_b[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+        // NOTE(review): the stride arrays have length SIZE_DIM_OUT-2, i.e. 0 when SIZE_DIM_OUT == 2; confirm the target compilers accept that
+    }
+
+    
+    // Assumption stated by the original author: if both inputs have the same number of dims, that number equals SIZE_DIM_OUT;
+    // otherwise the shorter shape has been padded above to SIZE_DIM_OUT dims.
+    // Number of output matrices = product of the output batch dims
+    int nbMatrices = 1;
+    for(int i = SIZE_DIM_OUT -3; i>=0; --i){
+        nbMatrices *= OUTPUT_DIMS[i];
+    }
+    int dim = SIZE_DIM_OUT -3; // index of the innermost batch dim
+
+    // Offsets are counted in whole matrices and multiplied by the matrix sizes when indexing
+    int offsetIn0 = 0;
+    int offsetIn1 = 0;
+    int offsetOut = 0;
+    const int n = ndim_a[SIZE_DIM_OUT - 2]; // rows of each A matrix
+    const int k = ndim_a[SIZE_DIM_OUT - 1]; // shared inner dim, taken from A's last dim
+    const int m = ndim_b[SIZE_DIM_OUT - 1]; // columns of each B matrix
+    const int matrix0Size = n*k;
+    const int matrix1Size = k*m;
+    const int matrixOutSize = n*m;
+
+    for(int stack = 0; stack < nbMatrices;){ // stack is incremented inside the body
+        // Row-major (n x k) * (k x m) product for the current pair of matrices
+        for (int i = 0; i < n; ++i) {
+
+            for (int j = 0; j < m; ++j) {
+                float sum = 0; // accumulator is float regardless of Input_T/Output_T
+
+                for (int l = 0; l < k; ++l) {
+                    sum += (inputs1[ offsetIn0*matrix0Size + i*k + l] * inputs2[offsetIn1*matrix1Size + l*m + j]);
+                }
+                outputs[offsetOut*matrixOutSize + i*m + j] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
+            }
+        } 
+        // Move on to the next output matrix and update the input offsets accordingly
+        if (++stack < nbMatrices) {
+            int tmp_stack = stack;
+            while(tmp_stack % OUTPUT_DIMS[dim] == 0) { // walk outward past the batch dims that have just wrapped around
+                tmp_stack /= OUTPUT_DIMS[dim];
+                dim--;
             }
-            outputs[N*m + n] = activation_forward_value<Output_T>(sum, 0/*not applicable*/, ACTIVATION, rescaling);
+            offsetIn0 += stride_step0[dim]; // dim is now the batch dim that advanced
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = SIZE_DIM_OUT -3; // reset to the innermost batch dim for the next iteration
         }
+
     }
+
 }

-#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
+#endif  // __AIDGE_EXPORT_CPP_KERNELS_MATMUL__
+\ No newline at end of file