From af6a86d49a65c280b9588e44698b562cc2676f8c Mon Sep 17 00:00:00 2001
From: IKucher <Inna.KUCHER@cea.fr>
Date: Wed, 4 Oct 2023 09:43:31 +0000
Subject: [PATCH] max pool implem + unit_test

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |  70 ++++++
 .../MaxPoolingImpl_forward_kernels.hpp        | 211 ++++++++++++++++++
 src/operator/MaxPoolingImpl.cpp               |  80 +++++++
 unit_tests/operator/Test_MaxPoolingImpl.cpp   |  82 +++++++
 5 files changed, 444 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp
 create mode 100644 src/operator/MaxPoolingImpl.cpp
 create mode 100644 unit_tests/operator/Test_MaxPoolingImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 59dc3cec..336d549a 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -15,6 +15,7 @@
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
new file mode 100644
index 00000000..a29be1f3
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -0,0 +1,70 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_
+#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// class MaxPooling_Op;
+
+// compute kernel registry for forward and backward
+class MaxPoolingImpl2DForward_cpu
+    : public Registrable<MaxPoolingImpl2DForward_cpu,
+                         std::tuple<DataType, DataType>,
+                         void(const MaxPooling_Op<2>::Parameters &, const std::array<DimSize_t, 4> &, const void *, void *)> {};
+class MaxPoolingImpl2DBackward_cpu
+    : public Registrable<MaxPoolingImpl2DBackward_cpu,
+                         std::tuple<DataType, DataType>,
+                         void(const MaxPooling_Op<2>::Parameters &, const std::array<DimSize_t, 4> &, const void *, void *)> {};
+
+class MaxPoolingImpl2D_cpu : public OperatorImpl {
+   private:
+    const MaxPooling_Op<2> &mOp;
+    std::array<NbElts_t, 1> mNbConsumedData;
+    std::array<NbElts_t, 1> mNbProducedData;
+
+   public:
+    MaxPoolingImpl2D_cpu(const MaxPooling_Op<2> &op) : mOp(op), mNbConsumedData({0}), mNbProducedData({0}) {}
+
+    static std::unique_ptr<MaxPoolingImpl2D_cpu> create(const MaxPooling_Op<2> &op) {
+        return std::make_unique<MaxPoolingImpl2D_cpu>(op);
+    }
+
+   public:
+    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
+    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
+    NbElts_t getRequiredMemory(const IOIndex_t /*outputIdx*/, const std::vector<DimSize_t> &inputsSize) const override final;
+    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
+    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
+    void updateConsummerProducer() override final;
+
+    void forward();
+
+    void backward();
+};
+
+namespace {
+// add cpu backend to MaxPooling_Op<2> implementation registry
+static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl2D_cpu("cpu", Aidge::MaxPoolingImpl2D_cpu::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp
new file mode 100644
index 00000000..c278e271
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp
@@ -0,0 +1,211 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/data/Data.hpp"
+#include <array>
+#include <tuple>
+#include <cmath>
+
+namespace Aidge {
+/**
+ * @brief Forward kernel for 2D MaxPoolingolution on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param params tuple of Parameters from the Operator
+ * @param dims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class O>
+void MaxPoolingImpl2D_cpu_forward_kernel(const MaxPooling_Op<2>::Parameters &params,
+                                             const std::array<DimSize_t, 4> &dims,
+                                             const void *input_,
+                                             void *output_) {
+    // FIXME: missing convolution parameters as arguments
+    const I *input = static_cast<const I *>(input_);
+    O *output = static_cast<O *>(output_);
+
+
+    // output H size
+    const std::size_t oxSize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] + std::get<2>(params)[0] + std::get<2>(params)[2] - std::get<1>(params)[0] + std::get<0>(params)[0]) /
+                                static_cast<float>(std::get<0>(params)[0])));
+    // output W size
+    const std::size_t oySize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] + std::get<2>(params)[1] + std::get<2>(params)[3] - std::get<1>(params)[1] + std::get<0>(params)[1]) /
+                                static_cast<float>(std::get<0>(params)[1])));
+
+    // TODO: kernel computation
+    // output (batch, outCh, Xout, Yout)
+    // input  (batch, ch, Xin, Yin)
+    // weight (outCh, ch, kernelX, kernelY)
+    // does not take Dilation parameter into account
+    using signedsize = std::make_signed<std::size_t>::type;
+    for (std::size_t batch = 0; batch < dims[0]; ++batch) {
+        for (std::size_t ch = 0; ch < dims[1]; ++ch) {
+            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
+            for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                const signedsize difx = static_cast<signedsize>(std::get<2>(params)[0] - ox * std::get<0>(params)[0]);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > std::get<1>(params)[0] ? std::get<1>(params)[0] : dims[2] + difx);
+                for (std::size_t oy = 0; oy < oySize; ++oy) {
+                    const signedsize dify = static_cast<signedsize>(std::get<2>(params)[1] - oy * std::get<0>(params)[1]);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > std::get<1>(params)[1] ? std::get<1>(params)[1] : dims[3] + dify);
+                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                    const std::size_t ix = ox * std::get<0>(params)[0];
+                    const std::size_t iy = oy * std::get<0>(params)[1];
+
+                    I poolValue(0.0);
+
+                    for (unsigned int channel = 0; channel < dims[1];
+                            ++channel){
+                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
+                            {
+                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+
+                                if (!valid || value > poolValue) {
+                                    poolValue = value;
+                                }
+                            }
+                        }
+                    }
+                    output[oIndexFull] = poolValue;
+                }
+            }
+        }
+    }
+}
+
+//N2D2 version
+/*
+template <class T>
+void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
+                                              const Tensor<T>&
+                                              inputs,
+                                              const Descriptor& desc,
+                                              const T* beta,
+                                              Tensor<T>& outputs,
+                                              Tensor<ArgMax>& argMax,
+                                              bool useArgMax,
+                                              const Tensor<bool>& maps)
+{
+    const unsigned int size = inputs.dimB() * outputs.dimZ();
+
+#if defined(_OPENMP) && _OPENMP >= 200805
+#pragma omp parallel for collapse(2) if (size > 16)
+#else
+#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
+#endif
+    for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
+        for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
+            for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
+                for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
+                    const unsigned int sxMin = (unsigned int)std::max(
+                        desc.padding[0] - (int)(ox * desc.stride[0]), 0);
+                    const unsigned int syMin = (unsigned int)std::max(
+                        desc.padding[1] - (int)(oy * desc.stride[1]), 0);
+                    const unsigned int sxMax = Utils::clamp
+                        <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
+                              0,
+                              desc.pool[0]);
+                    const unsigned int syMax = Utils::clamp
+                        <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
+                              0,
+                              desc.pool[1]);
+
+                    const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
+                    const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
+
+                    T poolValue(0.0);
+
+                    // For each output, compute the pool value
+                    if (useArgMax) {
+                        const ArgMax inputMax
+                            = argMax(ox, oy, output, batchPos);
+
+                        if (inputMax.valid) {
+                            poolValue = inputs(inputMax.ix,
+                                               inputMax.iy,
+                                               inputMax.channel,
+                                               batchPos);
+                        }
+                    }
+                    else {
+                        unsigned int ixMax = 0;
+                        unsigned int iyMax = 0;
+                        unsigned int channelMax = 0;
+                        bool valid = false;
+
+                        for (unsigned int channel = 0; channel < inputs.dimZ();
+                             ++channel)
+                        {
+                            if (!maps.empty() && !maps(output, channel))
+                                continue;
+
+                            for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                                for (unsigned int sx = sxMin; sx < sxMax; ++sx)
+                                {
+                                    const T value = inputs(ix + sx,
+                                                                 iy + sy,
+                                                                 channel,
+                                                                 batchPos);
+
+                                    if (!valid || value > poolValue) {
+                                        poolValue = value;
+                                        valid = true;
+
+                                        ixMax = ix + sx;
+                                        iyMax = iy + sy;
+                                        channelMax = channel;
+                                    }
+                                }
+                            }
+                        }
+
+                        argMax(ox, oy, output, batchPos)
+                            = ArgMax(ixMax, iyMax, channelMax, valid);
+                    }
+
+                    outputs(ox, oy, output, batchPos)
+                        = (*alpha) * poolValue
+                          + (*beta) * outputs(ox, oy, output, batchPos);
+                }
+            }
+        }
+    }
+}
+
+*/
+
+namespace {
+static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Float32(
+        std::tuple<DataType, DataType>({DataType::Float32, DataType::Float32}),
+        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>);
+static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Int32(
+        {DataType::Int32, DataType::Int32},
+        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int, int>);
+static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Float64(
+        {DataType::Float64, DataType::Float64},
+        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_ */
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
new file mode 100644
index 00000000..a4aa5f09
--- /dev/null
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -0,0 +1,80 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+#include "aidge/utils/Types.h"
+#include "aidge/operator/MaxPooling.hpp"
+
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp"
+
+Aidge::NbElts_t Aidge::MaxPoolingImpl2D_cpu::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const {
+    assert(mOp.getInput(inputIdx) && "requires valid input");
+
+    // Requires the whole tensors
+    const auto &inputDims = std::static_pointer_cast<Tensor>(mOp.getInput(inputIdx))->dims();
+
+    return std::accumulate(inputDims.begin(), inputDims.end(), Aidge::NbElts_t(1), std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::MaxPoolingImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
+    // for the direct convolution algorithm, convolutions can be in-place, if
+    // there is no padding!
+    return 0;
+}
+
+Aidge::NbElts_t Aidge::MaxPoolingImpl2D_cpu::getRequiredMemory(const Aidge::IOIndex_t outputIdx,
+                                                           const std::vector<Aidge::DimSize_t> & /*inputsSize*/) const {
+    // Requires the whole tensors, regardless of available data on inputs
+    assert(outputIdx == 0 && "operator has only one output");
+    (void) outputIdx;
+
+    const auto &outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
+    return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::MaxPoolingImpl2D_cpu::getNbConsumedData(Aidge::IOIndex_t inputIdx) const {
+    assert(static_cast<std::size_t>(inputIdx) < mNbConsumedData.size());
+    return mNbConsumedData[static_cast<std::size_t>(inputIdx)];
+}
+
+Aidge::NbElts_t Aidge::MaxPoolingImpl2D_cpu::getNbProducedData(Aidge::IOIndex_t outputIdx) const {
+    assert((outputIdx == 0) && (static_cast<std::size_t>(outputIdx) < mNbProducedData.size()));
+    return mNbProducedData[static_cast<std::size_t>(outputIdx)];
+}
+void Aidge::MaxPoolingImpl2D_cpu::updateConsummerProducer(){
+    // Update producer-consumer data
+    for (std::size_t inputIdx = 0; inputIdx < mNbConsumedData.size(); ++inputIdx)
+        mNbConsumedData[inputIdx] += getNbRequiredData(static_cast<IOIndex_t>(inputIdx));  // each input is consumed by the minimum
+                                                                                           // amount for a forward pass
+    mNbProducedData[0] += getRequiredMemory(0, {});
+}
+void Aidge::MaxPoolingImpl2D_cpu::forward() {
+    // FIXME: uncomment the following code once memory handling will work
+    assert(mOp.getInput(0) && "missing input #0");
+
+    // Find the correct kernel type
+    auto kernelFunc =
+            Registrar<MaxPoolingImpl2DForward_cpu>::create({mOp.getInput(0)->dataType(), mOp.getOutput(0)->dataType()});
+
+    // Call kernel
+    kernelFunc(mOp.getParams(),
+               mOp.getInput(0)->dims<4>(),
+               mOp.getInput(0)->getImpl()->rawPtr(),
+               mOp.getOutput(0)->getImpl()->rawPtr());
+
+}
+
+void Aidge::MaxPoolingImpl2D_cpu::backward() { printf("Not implemented yet.\n"); }
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
new file mode 100644
index 00000000..83fa7eaa
--- /dev/null
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -0,0 +1,82 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <memory>
+#include <cstdlib>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[cpu/operator] MaxPooling(forward)") {
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+        {
+            {
+                {{-0.3848,  0.2166, -0.4373,  0.6142,  0.5277},
+                 {0.7995,  0.3638, -1.4589, -1.0843,  1.0918},
+            	 {0.7147,  0.0936, -1.2902,  1.2037,  0.4874},
+                 {-0.5981,  2.1184, -0.9175,  1.3859,  0.3305},
+                 {-1.7700,  0.0563, -0.3914,  0.0538, -0.3955}},
+
+                {{-3.1409, -0.4554,  0.0524,  2.2291,  0.4859},
+                 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
+                 { 2.2329, -0.5850,  0.0700,  1.2838, -1.7363},
+                 { 0.2139,  0.0624, -1.0689, -0.8221, -0.8038},
+                 { 0.1886, -0.7840, -0.2313,  0.2651, -1.6244}}
+            },
+            {
+                {{ 0.4371,  1.6417,  0.9129,  0.6325,  0.5438},
+                 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
+                 {1.7653, -1.6668, -1.0814,  0.6182,  1.2071},
+                 {0.9541, -0.5133,  0.8664, -0.8892,  1.4585},
+                 {1.0220, -0.5107,  0.1829, -0.2301, -0.4268}},
+
+                {{ 1.0429,  0.6279, -0.2875,  0.7187, -0.1500},
+                 {1.6041,  2.9635,  1.4172, -0.7517,  0.5441},
+                 {-0.2276,  0.0857,  0.6776, -0.1389, -0.0614},
+                 {-0.1547, -0.3435,  0.0650, -0.5095, -1.8073},
+                 {1.7217,  0.3999, -0.5953,  1.0604, -0.4126}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2});
+        myMaxPool->getOperator()->setDatatype(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { 
+            {
+                {
+                    {{  0.7995,  0.6142},
+                     { 2.1184,  1.3859}},
+                    {{ -0.4554,  2.2291},
+                     {  2.2329,  1.2838}}
+                },
+                {
+                    {{1.6417,  0.9129},
+                     {1.7653,  0.8664}},
+                    {{2.9635,  1.4172},
+                     {0.0857,  0.6776}}
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->computeOutputDims();
+        myMaxPool->forward();
+        myMaxPool->getOperator()->getOutput(0)->print();
+        REQUIRE(*(myMaxPool->getOperator()->getOutput(0)) == *myOutput);
+    }
+}
\ No newline at end of file
-- 
GitLab