From e2a71d4f81b45f8a2aa77192a70c88ca75175827 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 13 Feb 2024 14:56:40 +0000
Subject: [PATCH] Rework MatMul kernel - [function] computation of input
 dimensions now happens in 'MatMulImpl_cpu::forward()', which handles the
 indexing and sends the right pointers to its computation kernel - [function]
 'getCPUPtr()' takes a new 'offset' parameter - [function] the MatMul kernel
 is reduced to a plain matrix multiplication
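
The broadcast handling follows matmul semantics: 1-D inputs are promoted to
matrices, and batch dimensions are padded with leading 1s and broadcast
against each other. For example, with input shapes (2, 1, 3, 4) and
(2, 5, 4, 6) the output has shape (2, 5, 3, 6); 'forward()' iterates over the
2*5 = 10 output matrices and calls the kernel once per matrix with n=3, k=4,
m=6, reusing broadcast input matrices through the computed offsets.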

---
 include/aidge/backend/cpu/data/GetCPUPtr.h    |   4 +-
 .../aidge/backend/cpu/operator/MatMulImpl.hpp |   4 +-
 .../operator/MatMulImpl_forward_kernels.hpp   |  65 ++--------
 src/operator/MatMulImpl.cpp                   | 119 ++++++++++++++++--
 4 files changed, 122 insertions(+), 70 deletions(-)

diff --git a/include/aidge/backend/cpu/data/GetCPUPtr.h b/include/aidge/backend/cpu/data/GetCPUPtr.h
index 47e3b07e..335be9ca 100644
--- a/include/aidge/backend/cpu/data/GetCPUPtr.h
+++ b/include/aidge/backend/cpu/data/GetCPUPtr.h
@@ -15,9 +15,9 @@
 #include "aidge/data/Tensor.hpp"
 
 namespace Aidge {
-inline void *getCPUPtr(std::shared_ptr<Aidge::Data> const &data) {
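+// Returns a host pointer to the tensor's data, shifted by `offset` elements
+// beyond the tensor's own implementation offset (same unit as
+// `getImplOffset()`).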
+inline void *getCPUPtr(std::shared_ptr<Aidge::Data> const &data, const std::size_t offset = 0) {
   const auto tensor = std::static_pointer_cast<Tensor>(data);
-  return tensor->getImpl()->hostPtr(tensor->getImplOffset());
+  return tensor->getImpl()->hostPtr(tensor->getImplOffset() + offset);
 }
 } // namespace Aidge
 
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl.hpp b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
index ef517065..437ba404 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
@@ -23,12 +23,10 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class MatMul_Op;
 
-// compute kernel registry for forward and backward
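+// Compute kernel registry for forward and backward.
+// Forward kernels take (n, k, m, input1, input2, output), where input1 is an
+// n-by-k matrix, input2 is k-by-m and output is n-by-m, all row-major.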
 class MatMulImplForward_cpu
     : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType>,
-                         void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&,
+                         void(const std::size_t, const std::size_t, const std::size_t,
                               const void *, const void *, void *)> {};
 class MatMulImplBackward_cpu
     : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType>,
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
index a63e9e12..5045580f 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
@@ -12,70 +12,27 @@
 #ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
 #define AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
 
-#include "aidge/utils/Registrar.hpp"
-#include <algorithm>
-
-// #include <omp.h>
+#include <cstddef>  // std::size_t
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 
 namespace Aidge {
 
 template <class I, class O>
-void MatMulImpl_cpu_forward_kernel(const std::vector<DimSize_t>& input1Dims,const std::vector<DimSize_t>& input2Dims,
-                                   const void* input1_, const void* input2_, void* output_) {
+void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
+                                    const void* input1_, const void* input2_, void* output_) {
     // FIXME: missing MatMul parameters as arguments
     const I* input1 = static_cast<const I*>(input1_);
     const I* input2 = static_cast<const I*>(input2_);
     O* output = static_cast<O*>(output_);
 
-	size_t secondToLastIdx2 = input2Dims.size() > 1 ? input2Dims.size() - 2 : 0;
-	// Checking if matrix dimensions are compatible for multiplication
-	assert(input1Dims[input1Dims.size()-1] == input2Dims[secondToLastIdx2] &&
-            "Matrix dimensions are not compatible for multiplication");
-
-	std::size_t innerMulAxis = input1Dims[input1Dims.size()-1];
-	std::size_t rows1 = input1Dims[input1Dims.size()-2];
-	std::size_t cols2 = input2Dims[input2Dims.size()-1];
-	std::size_t nbMat1 = 1, nbMat2 = 1;
-	if (input1Dims.size()>2)
-	{
-		for (std::size_t i = 0; i < input1Dims.size()-2; i++)
-		{
-			nbMat1 *= input1Dims[i];
-		}
-		
-	}
-	if (input2Dims.size()>2)
-	{
-		for (std::size_t i = 0; i < input2Dims.size()-2; i++)
-		{
-			nbMat2 *= input2Dims[i];
-		}
-		
-	}
-	std::size_t mat1Size = rows1 * innerMulAxis;
-	std::size_t mat2Size = innerMulAxis * cols2;
-	std::size_t matSize = rows1 * cols2;
-	std::size_t nbMat = nbMat1 > nbMat2 ? nbMat1 : nbMat2;
-
-	for (std::size_t i = 0; i < nbMat; i++) {
-// #pragma omp parallel for num_threads(8)
-		for (std::size_t m = 0; m < rows1; m++)
-		{
-			
-			for (size_t k = 0; k < innerMulAxis; k++)
-			{
-				for (std::size_t n = 0; n < cols2; n++)
-				{
-                    if (k==0) {
-                        output[i * matSize + m * cols2 + n]  = 0;
-                    }
-                    
-					output[i * matSize + m * cols2 + n] += input1[(i%nbMat1) * mat1Size + m *innerMulAxis + k] * input2[(i%nbMat2)*mat2Size + k * cols2 + n];	
-				}
-			}	
-		}
-	}
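+    // input1 is an (n x k) matrix, input2 a (k x m) matrix, both row-major;
+    // output receives their (n x m) product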
+    for (std::size_t i = 0; i < n; ++i) {
+        for (std::size_t j = 0; j < m; ++j) {
+            O sum = O(0);
+            for (std::size_t l = 0; l < k; ++l) {
+                sum += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
+            }
+            output[i*m + j] = sum;
+        }
+    }
 }
 
 namespace {
diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp
index 5818ac64..488af176 100644
--- a/src/operator/MatMulImpl.cpp
+++ b/src/operator/MatMulImpl.cpp
@@ -9,15 +9,14 @@
  *
  ********************************************************************************/
 
-#include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::int32_t
+#include <functional>  // std::multiplies
+#include <numeric>  // std::accumulate
 #include <vector>
 
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp"
@@ -32,10 +31,108 @@ void Aidge::MatMulImpl_cpu::forward()
         {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
          std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
 
-    kernelFunc(
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims(),
-        getCPUPtr(mOp.getRawInput(0)),
-        getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawOutput(0)));
+    // Compute compatible input dimensions
+    std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims();
+    std::vector<std::size_t> dims1 = static_cast<const MatMul_Op&>(mOp).getInput(1)->dims();
+
+    // keep second-to-last dimension of dims0
+    const std::size_t keepDim0 = (dims0.size() > 1) ? 1 : 0;
+    // keep last dimension of dims1
+    const std::size_t keepDim1 = (dims1.size() > 1) ? 1 : 0;
+
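+    // promote 1-D inputs to matrices: prepend a 1 to dims0, append a 1 to
+    // dims1; the promoted axis is dropped from the output (keepDim* == 0)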
+    if (dims0.size() == 1) {
+        dims0.insert(dims0.cbegin(), 1);
+    }
+    if (dims1.size() == 1) {
+        dims1.push_back(1);
+    }
+
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    // at this point, dims0.size() == dims1.size()
+    const std::size_t nbDims = dims0.size();
+
+    // initialize strides to iterate through data because of broadcasting
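+    // stride_post*: number of matrices spanned by one index step of each
+    // batch dimension; stride_step*: offset delta (in matrices) applied when
+    // that dimension's index increments. Broadcast dimensions (size 1) get a
+    // step of 1 - stride_post, rewinding the offset instead of advancing it.
+    // Only allocated (and freed) when batch dimensions exist, i.e. nbDims > 2.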
+    std::size_t* stride_post0 = nullptr;
+    std::size_t* stride_post1 = nullptr;
+    std::int32_t* stride_step0 = nullptr;
+    std::int32_t* stride_step1 = nullptr;
+    if (nbDims > 2) {
+        stride_post0 = new std::size_t[nbDims-2];
+        stride_post0[nbDims - 3] = 1;
+        stride_post1 = new std::size_t[nbDims-2];
+        stride_post1[nbDims - 3] = 1;
+        for (std::size_t i = nbDims-4; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*dims0[i+1];
+            stride_post1[i] = stride_post1[i+1]*dims1[i+1];
+        }
+        stride_step0 = new std::int32_t[nbDims-2];
+        stride_step1 = new std::int32_t[nbDims-2];
+        for (std::size_t i = 0; i != nbDims-2; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post0[i]) : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post1[i]) : 1;
+        }
+    }
+
+    const std::vector<std::size_t>& outDims = static_cast<const MatMul_Op&>(mOp).getOutput(0)->dims();
+    const std::size_t nbMatrices = std::accumulate(outDims.cbegin(), outDims.cend() - keepDim0 - keepDim1, std::size_t(1), std::multiplies<std::size_t>());
+    std::size_t dim = outDims.size() - 1 - keepDim0 - keepDim1;
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    const std::size_t n = dims0[nbDims - 2];
+    const std::size_t k = dims0[nbDims - 1];
+    const std::size_t m = dims1[nbDims - 1];
+    const std::size_t matrix0Size = n*k;
+    const std::size_t matrix1Size = k*m;
+    const std::size_t matrixOutSize = n*m;
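+    // call the kernel once per output matrix; after each call, walk outward
+    // past batch dimensions that completed a full cycle to find the one whose
+    // index increments, and move the input offsets by its stride step (the
+    // output offset always advances by one matrix)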
+    for (std::size_t stack = 0; stack < nbMatrices;) {
+        kernelFunc(n, k, m,
+                    getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size),
+                    getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size),
+                    getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize));
+        if (++stack < nbMatrices) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outDims[dim] == 0) {
+                tmp_stack /= outDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = outDims.size() - 1 - keepDim0 - keepDim1;
+        }
+    }
+    if (nbDims > 2) {
+        delete[] stride_post0;
+        delete[] stride_post1;
+        delete[] stride_step0;
+        delete[] stride_step1;
+    }
 }
-- 
GitLab