From 1cd0d0715b5b55c092cf0229c2c5ce593a240f53 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 6 Sep 2024 17:03:46 +0200
Subject: [PATCH] Add backward pass CPU implementation for Mul operator
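
Implement Mul_Op::backward() for the CPU backend. For an output
Z = A * B with broadcasting, the kernel computes dL/dA = dL/dZ * B and
dL/dB = dL/dZ * A, accumulating the gradient of the lower-rank input
over the broadcast dimensions. In the 2x3 * 3 unit test, with an
all-ones output gradient, the gradient of the 1-D input is therefore
the column-wise sum of the 2-D input: {1+4, 2+5, 3+6} = {5, 7, 9}.
Unit tests cover the 2D*1D, 3D*1D, 4D*2D and 3D*2D broadcasting cases.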

---
 .../aidge/backend/cpu/operator/MulImpl.hpp    |  24 +-
 .../cpu/operator/MulImpl_backward_kernels.hpp |  92 ++++
 src/operator/MulImpl.cpp                      |  31 ++
 unit_tests/operator/Test_MulImpl.cpp          | 412 +++++++++++++++++-
 4 files changed, 534 insertions(+), 25 deletions(-)
 create mode 100644 include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp

diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index 2d42194c..008edf17 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -25,11 +25,25 @@ namespace Aidge {
 
 // compute kernel registry for forward and backward
 class MulImplForward_cpu
-    : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
-};
+    : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&,
+                                                                                            const std::vector<std::size_t>&,
+                                                                                            const std::vector<std::size_t>&,
+                                                                                            const void*,
+                                                                                            const void*,
+                                                                                            void*)> {};
+
 class MulImplBackward_cpu
-    : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
-};
+    : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t,
+                                                                                             const std::size_t,
+                                                                                             const std::size_t,
+                                                                                             const std::vector<std::size_t>&,
+                                                                                             const std::vector<std::size_t>&,
+                                                                                             const void*,
+                                                                                             const void*,
+                                                                                             const void*,
+                                                                                             void*,
+                                                                                             void*)> {};
 
 class MulImpl_cpu : public OperatorImpl {
 public:
@@ -40,7 +54,9 @@ public:
     }
 
     Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
+
     void forward() override;
+    void backward() override;
 };
 
 namespace {
diff --git a/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp
new file mode 100644
index 00000000..db4cf81f
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp
@@ -0,0 +1,92 @@
+#ifndef AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_
+
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::int64_t
+#include <vector>      // std::vector
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
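+// Backward kernel for the element-wise Mul operator.
+// Given Z = A * B, computes dL/dA = dL/dZ * B and dL/dB = dL/dZ * A.
+// The branch on tensor ranks handles broadcasting: the gradient of the
+// lower-rank input is accumulated over the broadcast dimensions, so its
+// gradient buffer is expected to be zero-initialized by the caller.
+// Note: index recovery via getMultiDimIndices/getFlattenedIndex assumes the
+// lower-rank input's dimensions align with the trailing dimensions of the
+// higher-rank input, which is the case exercised by the unit tests below.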
+template <class I1, class I2, class O>
+void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
+                                 const std::size_t input1Length,
+                                 const std::size_t grad0Length,
+                                 const std::vector<std::size_t>& input0Dims,
+                                 const std::vector<std::size_t>& input1Dims,
+                                 const void* input0_,
+                                 const void* input1_,
+                                 const void* grad_output_,
+                                 void* gradientInput0,
+                                 void* gradientInput1)
+{
+    const auto* input0 = static_cast<const I1*>(input0_);
+    const auto* input1 = static_cast<const I2*>(input1_);
+    const auto* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
+
+    if (input0Dims.size() >= input1Dims.size())
+    {
+        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
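+        // dL/dinput0 has the same shape as the output gradient: scale each
+        // element by the (broadcast) value of input1.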
+        for (std::size_t i = 0; i < input0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
+        }
+
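+        // dL/dinput1 is reduced over the broadcast dimensions: accumulate into
+        // the flattened input1 index (grad_input_1 must start at zero).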
+        for (std::size_t i = 0; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+        }
+
+    } else {
+        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
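+        // Mirrored case: input1 has the higher rank, so the roles of the two
+        // inputs are swapped.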
+        for (std::size_t i = 0; i < input1Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
+        }
+
+        for (std::size_t i = 0; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
+        }
+    }
+}
+
+
+namespace {
+static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::MulImpl_cpu_backward_kernel<float, float, float>);
+static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::MulImpl_cpu_backward_kernel<double, double, double>);
+static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>);
+static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int64(
+        {DataType::Int64, DataType::Int64, DataType::Int64},
+        Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>);
+}  // namespace
+}  // namespace Aidge
+#endif /* AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ */
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index d7feb9b7..e5fd911c 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -22,6 +22,7 @@
 
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp"
 
 Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
     // this implementation can be in-place
@@ -48,3 +50,32 @@ void Aidge::MulImpl_cpu::forward() {
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
+
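+// Computes the input gradients from the cached inputs and the output
+// gradient, dispatching to the backward kernel registered for the
+// gradient tensors' data types.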
+void Aidge::MulImpl_cpu::backward() {
+
+    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = in0->grad();
+    auto in1grad = in1->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find kernel function
+    auto kernelFunc = Registrar<MulImplBackward_cpu>::create({
+        out0grad->dataType(),
+        in0grad->dataType(),
+        in1grad->dataType()});
+
+    kernelFunc(/* input0Length */ in0grad->size(),
+               /* input1Length */ in1grad->size(),
+               /* grad0Length  */ out0grad->size(),
+               /* input0Dims   */ in0->dims(),
+               /* input1Dims   */ in1->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
+}
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 9d592d31..3378861d 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -24,6 +24,337 @@
 
 namespace Aidge {
 
+    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
+    {
+        std::shared_ptr<Node> myMul = Mul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        SECTION("Case 1: 2D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
+                {
+                    {
+                        {1,2,3},{4,5,6}
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {0.1,0.2,0.3}
+            ));
+
+            T0->setDataType(DataType::Float32);
+            T0->setBackend("cpu");
+            T1->setDataType(DataType::Float32);
+            T1->setBackend("cpu");
+
+            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+
+            op->associateInput(0,T0);
+            op->associateInput(1,T1);
+            op->forwardDims();
+
+            myMul->forward();
+            myMul->backward();
+
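+            // With an all-ones output gradient: grad(T0) broadcasts T1 over
+            // the rows, and grad(T1) sums T0 over the rows: {1+4, 2+5, 3+6}.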
+            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
+            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
+        }
+
+        SECTION("Case 2: 3D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {7.0, 8.0, 9.0},
+                            {10.0, 11.0, 12.0}
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                    {
+                        {
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            },
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            }
+                        }
+                    }
+                ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        },
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        }
+                    }
+                }
+            ));
+
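+            // grad(T1) sums T0 over the two leading axes:
+            // {1+4+7+10, 2+5+8+11, 3+6+9+12}.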
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {22.0, 26.0, 30.0}
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 3: 4D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 2.0, 3.0},
+                                {4.0, 5.0, 6.0},
+                                {7.0, 8.0, 9.0}
+                            },
+                            {
+                                {10.0, 11.0, 12.0},
+                                {13.0, 14.0, 15.0},
+                                {16.0, 17.0, 18.0}
+                            }
+                        },
+                        {
+                            {
+                                {19.0, 20.0, 21.0},
+                                {22.0, 23.0, 24.0},
+                                {25.0, 26.0, 27.0}
+                            },
+                            {
+                                {28.0, 29.0, 30.0},
+                                {31.0, 32.0, 33.0},
+                                {34.0, 35.0, 36.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
+                {
+                    {
+                        {0.5,0.3,0.1},
+                        {0.4,0.2,0.6},
+                        {0.7,0.8,0.9}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        },
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
+                {
+                    {
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        },
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        }
+                    }
+                }
+            ));
+
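+            // grad(T1) sums T0 over the two leading axes, e.g.
+            // 1+10+19+28 = 58 for the first element.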
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
+                {
+                    {
+                        {58.0, 62.0, 66.0},
+                        {70.0, 74.0, 78.0},
+                        {82.0, 86.0, 90.0}
+                    }
+                }
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 4: 3D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0, 4.0},
+                            {5.0, 6.0, 7.0, 8.0},
+                            {9.0, 10.0, 11.0, 12.0},
+                        },
+                        {
+                            {13.0, 14.0, 15.0, 16.0},
+                            {17.0, 18.0, 19.0, 20.0},
+                            {21.0, 22.0, 23.0, 24.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
+                {
+                    {
+                        {0.1, 0.2, 0.3, 0.4},
+                        {0.5, 0.6, 0.7, 0.8},
+                        {0.9, 1.0, 1.1, 1.2}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
+                {
+                    {
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        },
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
+                {
+                    {
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        },
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        }
+                    }
+                }
+            ));
+
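+            // grad(T1) sums T0 over the leading axis, e.g. 1+13 = 14.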
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
+                {
+                    {
+                        {14.0, 16.0, 18.0, 20.0},
+                        {22.0, 24.0, 26.0, 28.0},
+                        {30.0, 32.0, 34.0, 36.0}
+                    }
+                }
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
+
 TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
         }
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
-                const std::size_t nbDims = nbDimsDist(gen);
-                std::vector<std::size_t> dims;
+                const auto nbDims = nbDimsDist(gen);
+                auto dims = std::vector<std::size_t>{};
+
                 for (std::size_t i = 0; i < nbDims; ++i) {
                     dims.push_back(dimSizeDist(gen));
                 }
-                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array0;
                 delete[] array1;
                 delete[] result;
-
-                // with broadcasting
             }
             std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
 
+
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
                 // handle dimensions, replace some dimensions with '1' to get broadcasting
+
                 constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
+                std::vector<std::size_t> dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dimensions.push_back(dimSizeDist(gen));
                 }
-                std::vector<std::size_t> dims0 = dims;
-                std::vector<std::size_t> dims1 = dims;
-                std::vector<std::size_t> dimsOut = dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
+
+                auto dims0 = dimensions;
+                auto dims1 = dimensions;
+                auto dimsOut = dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    if (boolDist(gen))
+                    {
                         dims0[i] = 1;
                     }
-                    if (boolDist(gen)) {
+
+                    if (boolDist(gen))
+                    {
                         dims1[i] = 1;
                     }
+
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
                 // create arrays and fill them with random values
                 float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                 float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
                 float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
 
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
+                {
                     array0[i] = valueDist(gen);
                 }
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
+                {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
                 const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                 const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+
+                for (std::size_t a = 0; a < dimsOut[0]; ++a)
+                {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
+                    {
                         const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                     + strides0[1] * ((dims0[1] > 1) ? b : 0);
+
                         const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
                                                     + strides1[1] * ((dims1[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
+                        {
                             const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
+                            {
                                 std::size_t idx0 = idx0_0
                                                     + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                     + ((dims0[3] > 1) ? d : 0);
+
                                 std::size_t idx1 = idx1_0
                                                     + strides1[2] * ((dims1[2] > 1) ? c : 0)
                                                     + ((dims1[3] > 1) ? d : 0);
+
                                 result[idx_out + d] = array0[idx0] * array1[idx1];
                                 // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
                             }
-- 
GitLab