From 993b769e64cd2dd43f012ee336414a727d06e104 Mon Sep 17 00:00:00 2001
From: Adam Maroni <adamaroni@hotmail.fr>
Date: Wed, 26 Mar 2025 12:53:20 +0100
Subject: [PATCH] [Issue #250]: Add CPU backward implementation and unit
 tests for MaxPooling2D
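
Implements the backward pass of MaxPooling_Op<2> on the CPU backend.
The backward kernel mirrors the forward pass: for every output position
it re-runs the argmax search over the pooling window (honouring stride,
dilation and ceil mode) and increments the input gradient at the index
that produced the window maximum. Note that the kernel does not read
the output gradient: it assumes a unit upstream gradient, so each input
element ends up holding the number of windows it won, which is what the
added unit tests check.

A minimal standalone sketch of the accumulation rule (illustration
only, not the registered kernel):

    #include <cstddef>
    #include <iostream>

    int main() {
        // Two 1x2 windows (stride 2) over a 1D input, unit upstream gradient.
        const float in[4] = {0.3f, 0.7f, 0.9f, 0.1f};
        float grad[4]     = {0.f, 0.f, 0.f, 0.f};
        for (std::size_t w = 0; w < 2; ++w) {
            const std::size_t base = 2 * w;
            // The argmax input of each window receives the gradient.
            const std::size_t arg = (in[base + 1] > in[base]) ? base + 1 : base;
            grad[arg] += 1.f;
        }
        std::cout << grad[0] << grad[1] << grad[2] << grad[3] << '\n'; // 0110
        return 0;
    }

The backward kernel is registered for Float32, Float64 and Int32 in
place of the previous nullptr entries, and
MaxPoolingImpl2D_cpu::backward() no longer aborts.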

---
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |   7 +
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 137 +++++++++++++-
 src/operator/MaxPoolingImpl.cpp               |  19 +-
 unit_tests/operator/Test_MaxPoolingImpl.cpp   | 185 +++++++++++++++++-
 4 files changed, 339 insertions(+), 9 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 062088a1..804fb33a 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -27,6 +27,13 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const bool,
+                            const std::array<DimSize_t, 4> &,
+                            const void *,
+                            void *),
     void(const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 21eefb02..9a52c149 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -114,8 +114,8 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
               auto inputYPostDilation = strideYoffset + dilatedkernelY;
               if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
                 const I inputValue = input[
-		    inputBaseIndex + inputXPostDilation * dims[3] 
-		    + inputYPostDilation
+                    inputBaseIndex + inputXPostDilation * dims[3]
+                    + inputYPostDilation
                 ];
                 if (!valid || inputValue > poolValue) {
                   poolValue = inputValue;
@@ -131,16 +131,143 @@
   }
 } 
 
+
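+// Backward kernel for MaxPooling2D: zeroes grad_ (the input gradient), then
+// re-runs the forward argmax search for each output position and increments
+// the gradient at the winning input index. The upstream output gradient is
+// implicitly all ones, so each entry counts the windows its element won.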
+template <class I, class O>
+void MaxPoolingImpl2D_cpu_backward_kernel(
+  const std::array<DimSize_t, 2>& strideDims,
+  const std::array<DimSize_t, 2>& kernelDims,
+  const std::array<DimSize_t, 2>& dilations,
+  const bool ceilMode,
+  const std::array<DimSize_t, 4> &dims,
+  const void *input_,
+  void *grad_
+)
+{
+  const I *input = static_cast<const I *>(input_);
+  I *grad = static_cast<I *>(grad_);
+
+  // Fill the gradient with 0 to avoid garbage data
+  std::fill(grad,
+            grad + (dims[0] * dims[1] * dims[2] * dims[3]),
+            static_cast<I>(0)
+  );
+
+  // Output H size (same ceil/floor formula as the forward kernel)
+  auto hOut = static_cast<float>(
+    dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
+  ) / static_cast<float>(strideDims[0]);
+  const std::size_t outXSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(hOut))
+    : static_cast<std::size_t>(std::floor(hOut));
+
+  // Output W size (same ceil/floor formula as the forward kernel)
+  auto wOut = static_cast<float>(
+      dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
+    ) / static_cast<float>(strideDims[1]);
+
+  const std::size_t outYSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(wOut))
+    : static_cast<std::size_t>(std::floor(wOut));
+
+  using signedsize = std::make_signed<std::size_t>::type;
+
+  for (std::size_t batch = 0; batch < dims[0]; ++batch){
+    for (std::size_t channel = 0; channel < dims[1]; ++channel){
+      auto batchChannelIndex = (channel + batch * dims[1]);
+      const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
+      for (std::size_t outX = 0; outX < outXSize; ++outX) {
+        // Negated input offset of this output row, kept signed for clamping
+        const signedsize negStrideX =
+          -static_cast<signedsize>(outX * strideDims[0]);
+        const std::size_t kernelXMin = static_cast<std::size_t>(
+          std::max(negStrideX, signedsize(0))
+        );
+        // Clamp kernelXMax to the valid input range
+        std::size_t kernelXMax = dims[2] + negStrideX;
+        if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
+          kernelXMax = 0;
+        }
+        else if (kernelXMax > kernelDims[0]){
+          kernelXMax = kernelDims[0];
+        }
+        for (std::size_t outY = 0; outY < outYSize; ++outY) {
+          const signedsize negStrideY = -static_cast<signedsize>(outY * strideDims[1]);
+          const std::size_t kernelYMin = static_cast<std::size_t>(
+            std::max(negStrideY, signedsize(0))
+          );
+          // Clamp kernelYMax to the valid input range
+          std::size_t kernelYMax = dims[3] + negStrideY;
+          if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
+            kernelYMax = 0;
+          }
+          else if (kernelYMax > kernelDims[1]){
+            kernelYMax = kernelDims[1];
+          }
+          const std::size_t strideXoffset = outX * strideDims[0];
+          const std::size_t strideYoffset = outY * strideDims[1];
+          I poolValue(0.0);
+          bool valid = false;
+          std::size_t saveIndex = 0;  // input index of the current window max
+          for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
+            for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
+              // Apply dilation factor to kernel indices
+              const std::size_t dilatedkernelX = kX * dilations[0];
+              const std::size_t dilatedkernelY = kY * dilations[1];
+              // Ensure indices are within bounds
+              auto inputXPostDilation = strideXoffset + dilatedkernelX;
+              auto inputYPostDilation = strideYoffset + dilatedkernelY;
+              if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
+                const std::size_t inputIndex =
+                  inputBaseIndex + inputXPostDilation * dims[3]
+                  + inputYPostDilation;
+                const I inputValue = input[inputIndex];
+                if (!valid || inputValue > poolValue) {
+                  poolValue = inputValue;
+                  saveIndex = inputIndex;
+                  valid = true;
+                }
+              }
+            }
+          }
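+          // Credit the argmax input location (implicit unit upstream gradient)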
+          if (valid){
+            grad[saveIndex]++;
+          }
+        }
+      }
+    }
+  }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<float, float>,
+    }
+);
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<double, double>,
+    }
+);
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<int32_t, int32_t>,
+    }
+);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 13ef75b0..42be049d 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -25,7 +25,8 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
 
     // Find the correct kernel type
-    const auto impl = Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+    const auto impl =
+        Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
     impl.forward(op_.strideDims(),
@@ -39,5 +40,19 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
 
 template <>
 void Aidge::MaxPoolingImpl2D_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MaxPooling_Op<2> on backend cpu");
+    const auto& op_ = dynamic_cast<const MaxPooling_Op<2>&>(mOp);
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
+
+    // Find the correct kernel type
+    const auto impl =
+        Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(op_.strideDims(),
+                  op_.kernelDims(),
+                  op_.dilations(),
+                  op_.ceilMode(),
+                  op_.getInput(0)->template dims<4>(),
+                  getCPUPtr(mOp.getRawInput(0)),
+                  op_.getInput(0)->grad()->getImpl()->rawPtr());
 }
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index d480fc30..2bc5e1ee 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -55,7 +55,11 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         }
     });
     SECTION("Stride") {
-        std::shared_ptr<MaxPooling_Op<2>> op = std::make_shared<MaxPooling_Op<2>>(std::array<std::size_t, 2>({2,2}), std::array<std::size_t, 2>({2,2}));
+        std::shared_ptr<MaxPooling_Op<2>> op =
+            std::make_shared<MaxPooling_Op<2>>(
+                std::array<std::size_t, 2>({2, 2}),
+                std::array<std::size_t, 2>({2, 2})
+            );
 
         Tensor myOutput = Array4D<float,2,2,2,2> {
             {
@@ -172,4 +176,181 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op2->getOutput(0)->print();
         REQUIRE(*(op2->getOutput(0)) == *myOutput5);
     }
-}
\ No newline at end of file
+}
+
+
+
+TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
+    std::shared_ptr<Tensor> myInput =
+        std::make_shared<Tensor>(Array4D<float,2,2,5,5> { // NCHW
+        {
+            {
+                {{-0.3848,  0.2166, -0.4373,  0.6142,  0.5277},
+                 { 0.7995,  0.3638, -1.4589, -1.0843,  1.0918},
+                 { 0.7147,  0.0936, -1.2902,  1.2037,  0.4874},
+                 {-0.5981,  2.1184, -0.9175,  1.3859,  0.3305},
+                 {-1.7700,  0.0563, -0.3914,  0.0538, -0.3955}},
+
+                {{-3.1409, -0.4554,  0.0524,  2.2291,  0.4859},
+                 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
+                 { 2.2329, -0.5850,  0.0700,  1.2838, -1.7363},
+                 { 0.2139,  0.0624, -1.0689, -0.8221, -0.8038},
+                 { 0.1886, -0.7840, -0.2313,  0.2651, -1.6244}}
+            },
+            {
+                {{ 0.4371,  1.6417,  0.9129,  0.6325,  0.5438},
+                 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
+                 { 1.7653, -1.6668, -1.0814,  0.6182,  1.2071},
+                 { 0.9541, -0.5133,  0.8664, -0.8892,  1.4585},
+                 { 1.0220, -0.5107,  0.1829, -0.2301, -0.4268}},
+
+                {{ 1.0429,  0.6279, -0.2875,  0.7187, -0.1500},
+                 { 1.6041,  2.9635,  1.4172, -0.7517,  0.5441},
+                 {-0.2276,  0.0857,  0.6776, -0.1389, -0.0614},
+                 {-0.1547, -0.3435,  0.0650, -0.5095, -1.8073},
+                 { 1.7217,  0.3999, -0.5953,  1.0604, -0.4126}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<MaxPooling_Op<2>> op =
+		std::make_shared<MaxPooling_Op<2>>(
+			std::array<std::size_t, 2>({2,2}),
+			std::array<std::size_t, 2>({2,2})
+		);
+
+		Tensor grad = Array4D<float,2,2,5,5> {
+			{
+				{
+					{{0, 0, 0, 1, 0},
+					{1, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0},
+					{0, 1, 0, 1, 0},
+					{0, 0, 0, 0, 0}},
+
+					{{0, 1, 0, 1, 0},
+					{0, 0, 0, 0, 0},
+					{1, 0, 0, 1, 0},
+					{0, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0}}
+				},
+				{
+					{{0, 1, 1, 0, 0},
+					{0, 0, 0, 0, 0},
+					{1, 0, 0, 0, 0},
+					{0, 0, 1, 0, 0},
+					{0, 0, 0, 0, 0}},
+
+					{{0, 0, 0, 0, 0},
+					{0, 1, 1, 0, 0},
+					{0, 1, 1, 0, 0},
+					{0, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0}}
+				}
+			}
+		};
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+	op->backward();	
+	//op->getInput(0)->grad()->print();
+        REQUIRE(*(op->getInput(0)->grad()) == grad);
+    }
+    SECTION("Dilation"){
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+	Tensor grad = Array4D<float,2,2,5,5> {
+		{{{{0., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}},
+
+		 {{0., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 1., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}}},
+
+
+		{{{0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 0., 0., 2.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}},
+
+		 {{1., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 2., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {1., 0., 0., 0., 0.}}}}
+	};
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+	op->backward();	
+	//op->getInput(0)->grad()->print();
+        REQUIRE(*(op->getInput(0)->grad()) == grad);
+    }
+    SECTION("Ceil mode"){
+		std::shared_ptr<Tensor> myInput4 =
+			std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+				{{{
+					{ 1,  2,  3,  4,  5},
+					{ 6,  7,  8,  9, 10},
+					{11, 12, 13, 14, 15},
+					{16, 17, 18, 19, 20},
+					{21, 22, 23, 24, 25}
+				}}}
+			});
+
+		// MaxPool with ceil_mode = true
+		std::shared_ptr<Node> myMaxPool1 =
+			MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+		auto op1 = std::static_pointer_cast<OperatorTensor>(
+			myMaxPool1 -> getOperator()
+		);
+		Tensor grad = Array4D<float,1,1,5,5> {
+			{{{
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 1}, 
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 1}, 
+				{0, 1, 0, 1, 1}
+			}}}
+		};
+
+		op1->associateInput(0, myInput4);
+		op1->setDataType(DataType::Float32);
+		op1->setBackend("cpu");
+		op1->backward();	
+		//op1->getInput(0)->grad()->print();
+		REQUIRE(*(op1->getInput(0)->grad()) == grad);
+
+		// MaxPool with ceil_mode = false
+		std::shared_ptr<Node> myMaxPool2 =
+			MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+		auto op2 = std::static_pointer_cast<OperatorTensor>(
+				myMaxPool2 -> getOperator()
+			);
+
+		Tensor grad2 = Array4D<float,1,1,5,5> {
+			{{{
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 0}, 
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 0}, 
+				{0, 0, 0, 0, 0}
+			}}}
+		};
+
+		//op2->resetInput(0);
+		op2->associateInput(0, myInput4);
+		op2->setDataType(DataType::Float32);
+		op2->setBackend("cpu");
+		myMaxPool2->backward();
+		op2->getInput(0)->grad()->print();
+		REQUIRE(*(op2->getInput(0)->grad()) == grad2);
+	}
+}
-- 
GitLab