From fc36e10e5ea35c7c67623bf875b49e6f2752fe89 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Wed, 6 Dec 2023 16:57:51 +0100
Subject: [PATCH] remove matmul attrs and update kernel

---
 .../aidge/backend/cpu/operator/MatMulImpl.hpp |   8 +-
 .../operator/MatMulImpl_forward_kernels.hpp   |  54 +++--
 src/operator/MatMulImpl.cpp                   |  18 +-
 unit_tests/operator/Test_MatMulImpl.cpp       | 192 +++++++++++-------
 4 files changed, 155 insertions(+), 117 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MatMulImpl.hpp b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
index e8654c6e..ef517065 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
@@ -27,12 +27,12 @@ namespace Aidge {
 
 // compute kernel registry for forward and backward
 class MatMulImplForward_cpu
-    : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType, DataType>,
-                         void(const MatMul_Op::Attrs &, const DimSize_t, const DimSize_t,
+    : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType>,
+                         void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&,
                               const void *, const void *, void *)> {};
 class MatMulImplBackward_cpu
-    : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType, DataType>,
-                         void(const MatMul_Op::Attrs &, const DimSize_t, const DimSize_t,
+    : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType>,
+                         void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&,
                               const void *, const void *, void *)> {};
 
 class MatMulImpl_cpu : public OperatorImpl {
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
index bc52779e..92bc5a61 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
@@ -19,38 +19,79 @@
 
 namespace Aidge {
 
-template <class I, class W, class O>
-void MatMulImpl_cpu_forward_kernel(const MatMul_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t oneInputSize,
-                                   const void* input_, const void* weights_, void* output_) {
+/**
+ * @brief Forward kernel of the MatMul operator on CPU: output = input1 x input2.
+ *
+ * input1 is read as a row-major (rows1 x cols1) matrix, where cols1 is its last
+ * dimension and rows1 the product of all the preceding ones. input2 is read as a
+ * row-major (cols1 x cols2) matrix, where cols2 is the product of every dimension
+ * after the first one. The (rows1 x cols2) result is written row-major to output_.
+ *
+ * NOTE(review): for an input2 of rank > 2 the compatibility check looks at its
+ * second-to-last dimension while the indexing treats its first dimension as the
+ * contraction axis, so batched right-hand operands are not handled -- confirm
+ * whether callers are expected to pass them.
+ *
+ * @tparam I Element type of both inputs.
+ * @tparam O Element type of the output, also used for the accumulator.
+ * @param input1Dims Dimensions of the first input.
+ * @param input2Dims Dimensions of the second input.
+ * @param input1_ Raw pointer to the first input's data, read as const I*.
+ * @param input2_ Raw pointer to the second input's data, read as const I*.
+ * @param output_ Raw pointer to the output buffer, written as O*.
+ */
+template <class I, class O>
+void MatMulImpl_cpu_forward_kernel(const std::vector<DimSize_t>& input1Dims, const std::vector<DimSize_t>& input2Dims,
+                                   const void* input1_, const void* input2_, void* output_) {
     // FIXME: missing MatMul parameters as arguments
-    const I* input = static_cast<const I*>(input_);
-    const W* weights = static_cast<const W*>(weights_);
+    const I* input1 = static_cast<const I*>(input1_);
+    const I* input2 = static_cast<const I*>(input2_);
     O* output = static_cast<O*>(output_);
+    // Both shapes need at least one dimension: back() and the index below rely on it.
+    assert(!input1Dims.empty() && !input2Dims.empty() && "MatMul inputs must have at least one dimension");
+    // Position of input2's second-to-last dimension (its only dimension when 1D).
+    const std::size_t secondToLastIdx2 = input2Dims.size() > 1 ? input2Dims.size() - 2 : 0;
+    // Checking if matrix dimensions are compatible for multiplication
+    assert(input1Dims.back() == input2Dims[secondToLastIdx2] && "Matrix dimensions are not compatible for multiplication");
 
+    // Extracting dimensions
+    std::size_t rows1 = 1;
+    std::size_t cols2 = 1;
 
-    std::fill(output, output+(batchSize*std::get<0>(attrs)), O(0));
+    // For input1: "i + 1 < size()" cannot wrap around the way "size() - 1" does.
+    for (std::size_t i = 0; i + 1 < input1Dims.size(); ++i) {
+        rows1 *= input1Dims[i];
+    }
+    const std::size_t cols1 = input1Dims.back();
+
+    // For input2
+    for (std::size_t i = 1; i < input2Dims.size(); ++i) {
+        cols2 *= input2Dims[i];
+    }
 
-    for (std::size_t batch = 0; batch < batchSize; ++batch) {
-        for (std::size_t out = 0; out < std::get<0>(attrs); ++out) {
-            output[out + batch*std::get<0>(attrs)] = std::inner_product(input + batch*oneInputSize,
-                                                        input + (batch + 1)*oneInputSize,
-                                                        weights + out*oneInputSize,
-                                                        output[out + batch*std::get<0>(attrs)]);
+    // Multiplication
+    for (std::size_t i = 0; i < rows1; ++i) {
+        for (std::size_t j = 0; j < cols2; ++j) {
+            // Accumulate in the output type so the Int32 and Float64 kernels are not rounded through float.
+            O sum = O(0);
+            for (std::size_t k = 0; k < cols1; ++k) {
+                sum += input1[i * cols1 + k] * input2[k * cols2 + j];
+            }
+            output[i * cols2 + j] = sum;
         }
     }
 }
 
-
 namespace {
 static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::MatMulImpl_cpu_forward_kernel<float, float, float>);
+        {DataType::Float32, DataType::Float32},
+        Aidge::MatMulImpl_cpu_forward_kernel<float, float>);
 static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::MatMulImpl_cpu_forward_kernel<int, int, int>);
+        {DataType::Int32, DataType::Int32},
+        Aidge::MatMulImpl_cpu_forward_kernel<int, int>);
 static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::MatMulImpl_cpu_forward_kernel<double, double, double>);
+        {DataType::Float64, DataType::Float64},
+        Aidge::MatMulImpl_cpu_forward_kernel<double, double>);
 }  // namespace
 
 }  // namespace Aidge
diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp
index f02effb3..c1c3ccb0 100644
--- a/src/operator/MatMulImpl.cpp
+++ b/src/operator/MatMulImpl.cpp
@@ -30,24 +30,12 @@ void Aidge::MatMulImpl_cpu::forward()
     // Find the correct kernel type
     auto kernelFunc = Registrar<MatMulImplForward_cpu>::create(
         {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-         std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
          std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
 
-    // Call kernel
-    // if (mOp.getInput(0)->nbDims() == 4) {
-    //     kernelFunc(
-    //         mOp.getStaticAttributes(),
-    //         std::static_pointer_cast<Tensor>(mOp.getInput(0))->template dims<4>(),
-    //         mOp.getInput(0))->getImpl()->rawPtr(),
-    //         mOp.mInputs[1]->getImpl()->rawPtr(),
-    //         mOp.mInputs[2]->getImpl()->rawPtr(),
-    //         getCPUPtr(mOp.getRawOutput(0));
-    // }
-    // else
+
     kernelFunc(
-        dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp
index 1edb915f..ae10df27 100644
--- a/unit_tests/operator/Test_MatMulImpl.cpp
+++ b/unit_tests/operator/Test_MatMulImpl.cpp
@@ -20,92 +20,126 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
-    // Test MatMul forward with batch size = 2 and feature size = 75
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<int, 5, 75>{
-            {{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<int, 2, 5>{
-            {{23600, 23600, 23600, 23600, 23600}, {68600, 68600, 68600, 68600, 68600}}});
-
-    std::shared_ptr<Node> myMatMul = MatMul(75, 5, "mymatmul");
-    auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
-    op->associateInput(1, myWeights);
-
-    SECTION("2D input") {
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<int, 2, 75>{
-                {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
-                  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
-                  38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
-                  57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
-                 {75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
-                  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
-                  105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-                  120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
-                  135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
-        op->associateInput(0, myInput);
-        op->setDataType(DataType::Int32);
+    SECTION("2D Tensors") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {0.16672266, 0.39773488},
+                {0.83746278, 0.54205710}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array2D<float,2,2>{
+            {
+                {0.50658345, 0.04777747},
+                {0.22279310, 0.41348755}
+            }
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {0.17307153, 0.17242400},
+                {0.54501140, 0.26414573}
+            }
+        });
+
+        std::shared_ptr<Node> myMatMul = MatMul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         op->computeOutputDims();
         myMatMul->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+		expectedOutput->print();
+		op->getOutput(0)->print();
+
+        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
     }
-    SECTION("4D input") {
-        std::shared_ptr<Tensor> myInput =
-                std::make_shared<Tensor>(Array4D<int, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
-                                                                     {5, 6, 7, 8, 9},
-                                                                     {10, 11, 12, 13, 14},
-                                                                     {15, 16, 17, 18, 19},
-                                                                     {20, 21, 22, 23, 24}},
-                                                                    {{25, 26, 27, 28, 29},
-                                                                     {30, 31, 32, 33, 34},
-                                                                     {35, 36, 37, 38, 39},
-                                                                     {40, 41, 42, 43, 44},
-                                                                     {45, 46, 47, 48, 49}},
-                                                                    {{50, 51, 52, 53, 54},
-                                                                     {55, 56, 57, 58, 59},
-                                                                     {60, 61, 62, 63, 64},
-                                                                     {65, 66, 67, 68, 69},
-                                                                     {70, 71, 72, 73, 74}}},
-                                                                   {{{75, 76, 77, 78, 79},
-                                                                     {80, 81, 82, 83, 84},
-                                                                     {85, 86, 87, 88, 89},
-                                                                     {90, 91, 92, 93, 94},
-                                                                     {95, 96, 97, 98, 99}},
-                                                                    {{100, 101, 102, 103, 104},
-                                                                     {105, 106, 107, 108, 109},
-                                                                     {110, 111, 112, 113, 114},
-                                                                     {115, 116, 117, 118, 119},
-                                                                     {120, 121, 122, 123, 124}},
-                                                                    {{125, 126, 127, 128, 129},
-                                                                     {130, 131, 132, 133, 134},
-                                                                     {135, 136, 137, 138, 139},
-                                                                     {140, 141, 142, 143, 144},
-                                                                     {145, 146, 147, 148, 149}}}}});
-        op->associateInput(0, myInput);
-        op->setDataType(DataType::Int32);
+
+
+    SECTION("3D Tensor by 1D Tensor") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array3D<float,2,2,3> {
+            {
+                {{0.82786506, 0.19047028, 0.62954658},
+         		 {0.63160968, 0.12468684, 0.49015969}},
+
+        		{{0.49215794, 0.42231840, 0.02699018},
+        		 {0.66403216, 0.94622904, 0.42048711}}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array1D<float,3>{
+            {0.82458717, 0.88598752, 0.78737932}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {1.34709311, 1.01722980},
+        		{0.80124742, 1.71698236}
+            }
+        });
+
+        std::shared_ptr<Node> myMatMul = MatMul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         op->computeOutputDims();
         myMatMul->forward();
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+		expectedOutput->print();
+		op->getOutput(0)->print();
+
+        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
     }
 
-    // std::cout << static_cast<Tensor>((*myMatMul->getOperator())["weight"])[0][0][0][0] << std::endl;
+    SECTION("3D Tensor by 2D Tensor") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array3D<float,1,2,3> {
+            {
+                {
+					{0.53427607, 0.69181818, 0.30088913},
+         		 	{0.20866227, 0.67821276, 0.25695610}
+				}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array2D<float,3,4>{
+            {
+				{0.03158629, 0.21031839, 0.95692378, 0.05287921},
+				{0.66182911, 0.91662365, 0.07928377, 0.86983263},
+				{0.12386280, 0.63736272, 0.15963674, 0.465079722}
+			}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<float,1,2,4> {
+            {
+                {
+					{0.51201022, 0.93828046, 0.61414438, 0.76995558},
+         			{0.48727912, 0.82932562, 0.29446477, 0.72047055}
+				}
+            }
+        });
+
+        std::shared_ptr<Node> myMatMul = MatMul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->computeOutputDims();
+        myMatMul->forward();
+		expectedOutput->print();
+		op->getOutput(0)->print();
+
+        float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< expectedOutput->size(); ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
+    }
 }
\ No newline at end of file
-- 
GitLab