Skip to content
Snippets Groups Projects
Commit 102f1995 authored by Houssem ROUIS's avatar Houssem ROUIS
Browse files

add ReduceSum op

parent 60319519
No related branches found
No related tags found
2 merge requests!93Release v0.3.0,!75Learning backend cuda
......@@ -30,6 +30,7 @@
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
......
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Registry key type for the ReduceSum forward kernels on the CPU backend.
// Maps a (input DataType, output DataType) pair to a kernel with signature:
//   (axes to reduce, keepDims flag, input dims, raw input ptr, raw output ptr).
// Concrete kernels are registered in ReduceSumImpl_forward_kernels.hpp.
class ReduceSumImplForward_cpu
: public Registrable<ReduceSumImplForward_cpu,
std::tuple<DataType, DataType>,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
// Registry key type for a ReduceSum backward kernel.
// NOTE(review): no backward kernel is registered anywhere in this commit and
// the "1D" in the name looks copy-pasted from another operator's
// implementation — presumably a placeholder; confirm before relying on it.
class ReduceSumImpl1DBackward_cpu
: public Registrable<ReduceSumImpl1DBackward_cpu,
std::tuple<DataType, DataType>,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
// CPU OperatorImpl for ReduceSum: at forward() time it looks up the kernel
// registered for the tensors' datatypes and invokes it (see ReduceSumImpl.cpp).
class ReduceSumImpl_cpu : public OperatorImpl {
public:
ReduceSumImpl_cpu(const ReduceSum_Op& op) : OperatorImpl(op, "cpu") {}
// Factory used by the Registrar below to build this backend implementation.
static std::unique_ptr<ReduceSumImpl_cpu> create(const ReduceSum_Op &op) {
return std::make_unique<ReduceSumImpl_cpu>(op);
}
public:
void forward() override;
};
// Registers this implementation as the "cpu" backend for ReduceSum_Op.
namespace {
static Registrar<ReduceSum_Op> registrarReduceSumImpl_cpu("cpu", Aidge::ReduceSumImpl_cpu::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
/**
 * @brief CPU forward kernel for the ReduceSum operator.
 *
 * Sums the input tensor over the given axes and writes the result into the
 * caller-allocated output buffer. keepDims only affects the reported output
 * dims (handled by the operator), not the element count, so it is unused here.
 *
 * @tparam I input scalar type
 * @tparam O output scalar type
 * @param axes axes to reduce; when several are given they are assumed sorted
 *             in ascending order — TODO confirm the operator guarantees this
 * @param inputDims dimensions of the input tensor
 * @param input_ raw pointer to the input buffer
 * @param output_ raw pointer to the output buffer (already sized)
 */
template <class I, class O>
void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
                    DimSize_t /*keepDims*/,
                    const std::vector<DimSize_t>& inputDims,
                    const void* input_,
                    void* output_) {

    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);

    const std::size_t nb_dims = inputDims.size();
    // Seed with std::size_t{1} (not the int literal 1) so the accumulator is
    // std::size_t and cannot overflow for large tensors.
    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), std::size_t{1}, std::multiplies<std::size_t>());

    // 0-d (scalar) tensor: nothing to reduce, avoid the inputDims[nb_dims-1]
    // access below.
    if (nb_dims == 0) {
        output[0] = static_cast<O>(input[0]);
        return;
    }

    if (axes.size() == 1) {
        // Single-axis fast path: one pass over the data, no temporary buffer.
        const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], std::size_t{1}, std::multiplies<std::size_t>());
        const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims - 1 - axes[0], std::size_t{1}, std::multiplies<std::size_t>());

        const std::size_t dim_i = inputDims[axes[0]];
        for (std::size_t pre = 0; pre < stride_pre; ++pre) {
            for (std::size_t post = 0; post < stride_post; ++post) {
                const std::size_t idx_i = pre * dim_i * stride_post + post;
                const std::size_t idx_o = pre * stride_post + post;
                O sum = 0;
                for (std::size_t i = 0; i < dim_i; ++i) {
                    sum += input[idx_i + i * stride_post];
                }
                output[idx_o] = sum;
            }
        }
    } else {
        std::size_t outputElements = totalElements;

        // stride_post[i]: elements spanned by one step along axis i.
        std::vector<std::size_t> stride_post(nb_dims);
        stride_post[nb_dims - 1] = 1;
        for (std::size_t i = nb_dims - 2; i != static_cast<std::size_t>(-1); --i) {
            stride_post[i] = stride_post[i + 1] * inputDims[i + 1];
        }
        // stride_pre[i]: number of outer slices before axis i.
        std::vector<std::size_t> stride_pre(nb_dims);
        stride_pre[0] = 1;
        for (std::size_t i = 1; i < nb_dims; ++i) {
            stride_pre[i] = stride_pre[i - 1] * inputDims[i - 1];
        }

        // Reduce one axis at a time, ping-ponging through an owned buffer.
        // std::vector replaces the original raw new[]/delete[]: exception-safe
        // and leak-free (RAII), identical access pattern.
        std::vector<I> accumulationBuffer;
        const I* inputAccumulation = input;
        for (const auto& axisInt : axes) {
            const std::size_t a = static_cast<std::size_t>(axisInt);
            outputElements /= inputDims[a];
            std::vector<I> outputAccumulation(outputElements);

            const std::size_t dim_i = inputDims[a];
            for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
                for (std::size_t post = 0; post < stride_post[a]; ++post) {
                    const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
                    const std::size_t idx_o = pre * stride_post[a] + post;
                    I sum = 0;
                    for (std::size_t i = 0; i < dim_i; ++i) {
                        sum += inputAccumulation[idx_i + i * stride_post[a]];
                    }
                    outputAccumulation[idx_o] = sum;
                }
            }

            // Axis a is gone: pre-strides of the later (ascending) axes shrink
            // by its extent; their post-strides never included axis a.
            std::for_each(stride_pre.begin() + a + 1, stride_pre.end(), [dim_i](std::size_t& val) { val /= dim_i; });

            // Keep the new buffer alive as the next iteration's input.
            accumulationBuffer = std::move(outputAccumulation);
            inputAccumulation = accumulationBuffer.data();
        }

        // Copy the final accumulation into the caller's output buffer
        // (also covers the degenerate empty-axes case, where it copies the
        // input unchanged).
        std::copy(inputAccumulation, inputAccumulation + outputElements, output);
    }
}
// Register the templated kernel for each supported (input, output) datatype
// pair. The dispatch in ReduceSumImpl_cpu::forward() looks these up by the
// tensors' runtime datatypes.
namespace {
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>);
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32}, Aidge::ReduceSumImpl_cpu_forward_kernel<int, int>);
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include <memory>
#include <vector>
#include "aidge/utils/Types.h"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl_forward_kernels.hpp"
void Aidge::ReduceSumImpl_cpu::forward() {
const ReduceSum_Op& op_ = dynamic_cast<const ReduceSum_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<ReduceSumImplForward_cpu>::create({
op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
// Call kernel
kernelFunc(op_.axes(),
op_.keepDims(),
op_.getInput(0)->dims(),
op_.getInput(0)->getImpl()->rawPtr(),
op_.getOutput(0)->getImpl()->rawPtr());
}
......@@ -13,7 +13,7 @@
#include <memory>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/backend/cpu.hpp"
......@@ -21,96 +21,173 @@
using namespace Aidge;
// Unit tests for the CPU ReduceSum forward implementation.
// NOTE(review): this region of the scrape was unified-diff residue that
// interleaved the removed ArgMax tests with the added ReduceSum tests; the
// post-commit ReduceSum test case is reconstructed below from the added lines.
TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
    SECTION("KeepDims") {
        SECTION("test 1") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            // Reducing axis 1 with keepDims=1 keeps a size-1 middle dim.
            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
                {
                    {{ 25.0, 3.0 }},
                    {{ 70.0, 3.0 }},
                    {{ 115.0, 3.0 }}
                }
            });

            std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();

            REQUIRE(*(op->getOutput(0)) == myOutput);
        }
        SECTION("test 2") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
                {
                    {
                        { 0.0, 0.0 },
                        { 1.0, 1.0 },
                        { 2.0, 2.0 }
                    },
                    {
                        { 3.0, 3.0 },
                        { 4.0, 4.0 },
                        { 5.0, 5.0 }
                    },
                    {
                        { 6.0, 6.0 },
                        { 7.0, 7.0 },
                        { 8.0, 8.0 }
                    }
                }
            });
            // Reducing axes {1, 2} with keepDims=1 keeps two size-1 dims.
            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
                {
                    {{ 6.0 }},
                    {{ 24.0 }},
                    {{ 42.0 }}
                }
            });

            std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            myOutput.print();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == myOutput);
        }
    }
    SECTION("not_KeepDims") {
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
            {
                {
                    { 5.0, 1.0 },
                    { 20.0, 2.0 }
                },
                {
                    { 30.0, 1.0 },
                    { 40.0, 2.0 }
                },
                {
                    { 55.0, 1.0 },
                    { 60.0, 2.0 }
                }
            }
        });
        // keepDims=0 squeezes the reduced axis out of the output shape.
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
            {
                { 25.0, 3.0 },
                { 70.0, 3.0 },
                { 115.0, 3.0 }
            }
        });

        std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 0);
        auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
        op->associateInput(0,myInput);
        op->setDataType(DataType::Float32);
        op->setBackend("cpu");
        myReduceSum->forward();
        op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
    SECTION("all_axes") {
        SECTION("1") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            // Reducing every axis yields the grand total as a 1-element tensor.
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {219.0}
            });

            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();

            REQUIRE(*(op->getOutput(0)) == *myOutput);
        }
        SECTION("2") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
               {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
                { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
                { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
                { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
                { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
            });
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {2.587094f}
            });

            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();
            // Float accumulation: compare with a tolerance, not exact equality.
            REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
        }
    }
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment