Commit aa30d4b3 authored by Olivier BICHLER

Merge branch 'feat/mul-backward' into 'dev'

Add backward pass CPU implementation for Mul operator

See merge request !85
parents ddcd9dbc 1cd0d071
Pipeline #54403 passed
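For reference, the gradient rules this merge request implements follow directly from the product rule; for a broadcast input, the gradient is additionally summed over the broadcast axes. A sketch of the math (not part of the diff):

\[
Y = A \odot B,\qquad
\frac{\partial L}{\partial A} = \frac{\partial L}{\partial Y} \odot B,\qquad
\frac{\partial L}{\partial B} = \sum_{\text{broadcast axes of } B} \left( \frac{\partial L}{\partial Y} \odot A \right)
\]

where \(\odot\) is elementwise multiplication and \(B\) is broadcast to the shape of \(A\) when needed. The accumulation loops in the backward kernel below implement this sum.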
@@ -25,11 +25,25 @@ namespace Aidge {
// compute kernel registry for forward and backward
class MulImplForward_cpu
    : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&,
                                                                                            const std::vector<std::size_t>&,
                                                                                            const std::vector<std::size_t>&,
                                                                                            const void*,
                                                                                            const void*,
                                                                                            void*)> {};

class MulImplBackward_cpu
    : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t,
                                                                                             const std::size_t,
                                                                                             const std::size_t,
                                                                                             const std::vector<std::size_t>,
                                                                                             const std::vector<std::size_t>,
                                                                                             const void*,
                                                                                             const void*,
                                                                                             const void*,
                                                                                             void*,
                                                                                             void*)> {};

class MulImpl_cpu : public OperatorImpl {
public:
@@ -40,7 +54,9 @@ public:
    }

    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;

    void forward() override;
    void backward() override;
};

namespace {
...
MulImpl_backward_kernels.hpp (new file):

#ifndef AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_

#include "aidge/utils/Registrar.hpp"

#include <algorithm>
#include <cstdint>  // std::int32_t, std::int64_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"

namespace Aidge {
template <class I1, class I2, class O>
void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
                                 const std::size_t input1Length,
                                 const std::size_t grad0Length,
                                 const std::vector<std::size_t> input0Dims,
                                 const std::vector<std::size_t> input1Dims,
                                 const void* input0_,
                                 const void* input1_,
                                 const void* grad_output_,
                                 void* gradientInput0,
                                 void* gradientInput1)
{
    const auto* input0 = static_cast<const I1*>(input0_);
    const auto* input1 = static_cast<const I2*>(input1_);
    const auto* grad_output = static_cast<const O*>(grad_output_);

    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
    auto* grad_input_1 = static_cast<I2*>(gradientInput1);

    if(input0Dims.size() >= input1Dims.size())
    {
        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");

        // input0 and the output gradient share the same shape: each element of
        // grad_input_0 is the output gradient scaled by the broadcast input1 value.
        for(std::size_t i = 0; i < input0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input1Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);

            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
        }

        // input1 was broadcast over input0: its gradient accumulates the contribution
        // of every output position it fed (assumes grad_input_1 is zero-initialized).
        for(std::size_t i = 0; i < grad0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input1Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);

            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
        }
    } else {
        // Symmetric case: input1 is the larger tensor.
        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");

        for(std::size_t i = 0; i < input1Length; ++i)
        {
            const auto indices = getMultiDimIndices(input0Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);

            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
        }

        for(std::size_t i = 0; i < grad0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input0Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);

            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
        }
    }
}

namespace {
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float32(
    {DataType::Float32, DataType::Float32, DataType::Float32},
    Aidge::MulImpl_cpu_backward_kernel<float, float, float>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float64(
    {DataType::Float64, DataType::Float64, DataType::Float64},
    Aidge::MulImpl_cpu_backward_kernel<double, double, double>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int32(
    {DataType::Int32, DataType::Int32, DataType::Int32},
    Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int64(
    {DataType::Int64, DataType::Int64, DataType::Int64},
    Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>);
}  // namespace
}  // namespace Aidge

#endif /* AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ */
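A side note on the index arithmetic: the kernel calls getMultiDimIndices with a flat index that ranges over the *larger* tensor, so the modulo decomposition effectively wraps that index onto the smaller tensor's trailing-aligned shape. A minimal standalone sketch of that mapping, assuming trailing-dimension broadcasting (broadcastIndex is a hypothetical helper written for illustration, not an Aidge API):

#include <cstddef>
#include <iostream>
#include <vector>

// Wrap a flat index of the large (output-shaped) tensor onto the smaller
// tensor's dims, mirroring what getMultiDimIndices + getFlattenedIndex
// produce for the broadcast cases exercised by the tests below.
std::size_t broadcastIndex(const std::vector<std::size_t>& smallDims, std::size_t flatIdx) {
    std::vector<std::size_t> indices(smallDims.size());
    for (std::size_t i = smallDims.size(); i > 0; --i) {  // decompose, wrapping modulo each dim
        indices[i - 1] = flatIdx % smallDims[i - 1];
        flatIdx /= smallDims[i - 1];
    }
    std::size_t flat = 0;                                 // re-flatten within smallDims
    for (std::size_t i = 0; i < smallDims.size(); ++i) {
        flat = flat * smallDims[i] + indices[i];
    }
    return flat;
}

int main() {
    // Large tensor [2,3] against small tensor [3]: output index i maps to i % 3,
    // so the gradient of the small input sums down the columns.
    for (std::size_t i = 0; i < 6; ++i) {
        std::cout << i << " -> " << broadcastIndex({3}, i) << '\n';  // prints 0 1 2 0 1 2
    }
    return 0;
}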
@@ -22,6 +22,7 @@
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp"

Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
    // this implementation can be in-place
@@ -40,6 +41,7 @@ void Aidge::MulImpl_cpu::forward() {
    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());

    // Call kernel
    kernelFunc(inputDims0,
               inputDims1,
@@ -48,3 +50,32 @@ void Aidge::MulImpl_cpu::forward() {
               getCPUPtr(mOp.getRawInput(1)),
               getCPUPtr(mOp.getRawOutput(0)));
}

void Aidge::MulImpl_cpu::backward() {
    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);

    auto in0 = op_.getInput(0);
    auto in1 = op_.getInput(1);
    auto in0grad = op_.getInput(0)->grad();
    auto in1grad = op_.getInput(1)->grad();
    auto out0grad = op_.getOutput(0)->grad();

    // Find kernel function
    auto kernelFunc = Registrar<MulImplBackward_cpu>::create({
        out0grad->dataType(),
        in0grad->dataType(),
        in1grad->dataType()});

    kernelFunc(/* input0Length */ in0grad->size(),
               /* input1Length */ in1grad->size(),
               /* grad0Length  */ out0grad->size(),
               /* input0Dims   */ in0->dims(),
               /* input1Dims   */ in1->dims(),
               getCPUPtr(in0),
               getCPUPtr(in1),
               getCPUPtr(out0grad),
               getCPUPtr(in0grad),
               getCPUPtr(in1grad));
}
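For readers unfamiliar with Aidge's registry pattern: the {output, input0, input1} data-type tuple passed to Registrar<...>::create() selects one of the kernel instantiations registered in the backward kernels header. A minimal sketch of that lookup, mirroring the call above (fragment for illustration only):

// Resolves the kernel registered as registrarMulImplBackward_cpu_Float32.
auto kernel = Registrar<MulImplBackward_cpu>::create(
    {DataType::Float32, DataType::Float32, DataType::Float32});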
@@ -24,6 +24,337 @@
namespace Aidge {
TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
{
std::shared_ptr<Node> myMul = Mul();
auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cpu");
SECTION("Case 1: 2D and 1D tensors") {
const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
{
{
{1,2,3},{4,5,6}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
{0.1,0.2,0.3}
));
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
T1->setDataType(DataType::Float32);
T1->setBackend("cpu");
op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
op->associateInput(0,T0);
op->associateInput(1,T1);
op->forwardDims();
myMul->forward();
myMul->backward();
auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
}
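To see where these expected values come from: with the output gradient filled with ones,

\[
\frac{\partial L}{\partial T_0} = \mathbf{1} \odot T_1 = \begin{bmatrix} 0.1 & 0.2 & 0.3 \\ 0.1 & 0.2 & 0.3 \end{bmatrix},
\qquad
\frac{\partial L}{\partial T_1} = \sum_{\text{rows}} \mathbf{1} \odot T_0 = \begin{bmatrix} 1{+}4 & 2{+}5 & 3{+}6 \end{bmatrix} = \begin{bmatrix} 5 & 7 & 9 \end{bmatrix}.
\]

The remaining sections follow the same pattern on higher-rank tensors.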
SECTION("Case 2: 3D and 1D tensors") {
const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{1.0, 2.0, 3.0},
{4.0, 5.0, 6.0}
},
{
{7.0, 8.0, 9.0},
{10.0, 11.0, 12.0}
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{1, 1, 1},
{1, 1, 1}
},
{
{1, 1, 1},
{1, 1, 1}
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{0.3, 0.2, 0.1},
{0.3, 0.2, 0.1}
},
{
{0.3, 0.2, 0.1},
{0.3, 0.2, 0.1}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
{22.0, 26.0, 30.0}
));
for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
SECTION("Case 3: 4D and 2D tensors") {
const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
{
{
{
{
{1.0, 2.0, 3.0},
{4.0, 5.0, 6.0},
{7.0, 8.0, 9.0}
},
{
{10.0, 11.0, 12.0},
{13.0, 14.0, 15.0},
{16.0, 17.0, 18.0}
}
},
{
{
{19.0, 20.0, 21.0},
{22.0, 23.0, 24.0},
{25.0, 26.0, 27.0}
},
{
{28.0, 29.0, 30.0},
{31.0, 32.0, 33.0},
{34.0, 35.0, 36.0}
}
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
{
{
{0.5,0.3,0.1},
{0.4,0.2,0.6},
{0.7,0.8,0.9}
}
}
));
const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
{
{
{
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
},
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
}
},
{
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
},
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
}
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
{
{
{
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
},
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
}
},
{
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
},
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
{
{
{58.0, 62.0, 66.0},
{70.0, 74.0, 78.0},
{82.0, 86.0, 90.0}
}
}
));
for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
SECTION("Case 4: 3D and 2D tensors") {
const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
{
{
{
{1.0, 2.0, 3.0, 4.0},
{5.0, 6.0, 7.0, 8.0},
{9.0, 10.0, 11.0, 12.0},
},
{
{13.0, 14.0, 15.0, 16.0},
{17.0, 18.0, 19.0, 20.0},
{21.0, 22.0, 23.0, 24.0},
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
{
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
}
}
));
const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
{
{
{
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
},
{
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
{
{
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
},
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
{
{
{14.0, 16.0, 18.0, 20.0},
{22.0, 24.0, 26.0, 28.0},
{30.0, 32.0, 34.0, 36.0}
}
}
));
for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
}
TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
constexpr std::uint16_t NBTRIALS = 10; constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator // Create a random number generator
...@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
std::uniform_int_distribution<int> boolDist(0,1); std::uniform_int_distribution<int> boolDist(0,1);
// Create MatMul Operator // Create MatMul Operator
...@@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
std::chrono::time_point<std::chrono::system_clock> end; std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{}; std::chrono::duration<double, std::micro> duration{};
SECTION("MulImpl_cpu::forward()") { SECTION("MulImpl_cpu::forward()") {
SECTION("Scalar / Scalar") { SECTION("Scalar / Scalar") {
...@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
} }
SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
std::size_t number_of_operation = 0; std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors // generate 2 random Tensors
const std::size_t nbDims = nbDimsDist(gen); const auto nbDims = nbDimsDist(gen);
std::vector<std::size_t> dims; auto dims = std::vector<std::size_t>{};
for (std::size_t i = 0; i < nbDims; ++i) { for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen)); dims.push_back(dimSizeDist(gen));
} }
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements; number_of_operation += nb_elements;
// without broadcasting // without broadcasting
...@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
delete[] array0; delete[] array0;
delete[] array1; delete[] array1;
delete[] result; delete[] result;
// with broadcasting
} }
std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
std::cout << "total time: " << duration.count() << "μs" << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl;
} }
SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
std::size_t number_of_operation = 0; std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors // generate 2 random Tensors
// handle dimensions, replace some dimensions with '1' to get broadcasting // handle dimensions, replace some dimensions with '1' to get broadcasting
constexpr std::size_t nbDims = 4; constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dims; std::vector<std::size_t> dimensions;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen)); for (std::size_t i = 0; i < nbDims; ++i)
{
dimensions.push_back(dimSizeDist(gen));
} }
std::vector<std::size_t> dims0 = dims;
std::vector<std::size_t> dims1 = dims; auto dims0 = dimensions;
std::vector<std::size_t> dimsOut = dims; auto dims1 = dimensions;
for (std::size_t i = 0; i < nbDims; ++i) { auto dimsOut = dimensions;
if (boolDist(gen)) {
for (std::size_t i = 0; i < nbDims; ++i)
{
if (boolDist(gen))
{
dims0[i] = 1; dims0[i] = 1;
} }
if (boolDist(gen)) {
if (boolDist(gen))
{
dims1[i] = 1; dims1[i] = 1;
} }
dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
} }
for(auto dim : dims0)
{
Log::info("Dimension of input 0 : {}", dim);
}
for(auto dim : dims1)
{
Log::info("Dimension of input 1 : {}", dim);
}
// create arrays and fill them with random values // create arrays and fill them with random values
float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
{
array0[i] = valueDist(gen); array0[i] = valueDist(gen);
} }
for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
{
array1[i] = valueDist(gen); array1[i] = valueDist(gen);
} }
// compute true result // compute true result
const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) { for (std::size_t a = 0; a < dimsOut[0]; ++a)
{
for (std::size_t b = 0; b < dimsOut[1]; ++b)
{
const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+ strides0[1] * ((dims0[1] > 1) ? b : 0); + strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+ strides1[1] * ((dims1[1] > 1) ? b : 0); + strides1[1] * ((dims1[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
for (std::size_t c = 0; c < dimsOut[2]; ++c)
{
const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
for (std::size_t d = 0; d < dimsOut[3]; ++d)
{
std::size_t idx0 = idx0_0 std::size_t idx0 = idx0_0
+ strides0[2] * ((dims0[2] > 1) ? c : 0) + strides0[2] * ((dims0[2] > 1) ? c : 0)
+ ((dims0[3] > 1) ? d : 0); + ((dims0[3] > 1) ? d : 0);
std::size_t idx1 = idx1_0 std::size_t idx1 = idx1_0
+ strides1[2] * ((dims1[2] > 1) ? c : 0) + strides1[2] * ((dims1[2] > 1) ? c : 0)
+ ((dims1[3] > 1) ? d : 0); + ((dims1[3] > 1) ? d : 0);
result[idx_out + d] = array0[idx0] * array1[idx1]; result[idx_out + d] = array0[idx0] * array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
} }
......