From 7bd9c550694f560301abe4bd04aaeb1c81fd8cea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Tue, 25 Feb 2025 13:31:12 +0000
Subject: [PATCH] chore : conv forward 1/2D formatting

---
 .../aidge/backend/cpu/operator/ConvImpl.hpp   |   1 +
 .../backend/cpu/operator/ConvImpl_kernels.hpp | 249 ++++++++++++------
 src/operator/ConvImpl.cpp                     |  21 +-
 3 files changed, 177 insertions(+), 94 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index 8bf11ac0..e480697b 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -20,6 +20,7 @@
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+
 // Operator implementation entry point for the backend
 using Conv1D_Op = Conv_Op<1>;
 using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 4e1861b4..274f5f4f 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -37,16 +37,15 @@ using std::array;
  * @param output_ Output Tensor.
  */
 template <class I, class W, class B, class O>
-void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
-                            const std::array<DimSize_t, 1>& dilationDims,
-                            const std::array<DimSize_t, 1>& kernelDims,
-                            const std::array<DimSize_t, 3>& inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
+void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
+                                   const array<DimSize_t, 1> &dilationDim,
+                                   const array<DimSize_t, 1> &kernelDim,
+                                   const std::array<DimSize_t, 3> &inputDims,
+                                   DimSize_t outChannels,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     const W *weights = static_cast<const W *>(weights_);
@@ -54,34 +53,51 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
+    const std::size_t oxSize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[2] - dilationDim[0] * (kernelDim[0] - 1) -
+                           1 + strideDim[0]) /
+        static_cast<float>(strideDim[0])));
+    const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, inCh, Xin, Yin)
-    // weight (outCh, inCh, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
+            const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
             // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
+            std::fill(output + oIndex, output + (oIndex + oxSize), biasVal);
             for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
+                const std::size_t iIndex =
+                    (inCh + batch * inputDims[1]) * inputDims[2];
+                const std::size_t wIndex =
+                    (inCh + outCh * inputDims[1]) * kernelDim[0];
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    // const signedsize difx = static_cast<signedsize>(- ox *
+                    // strideDim[0s); const std::size_t sxMin =
+                    // static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    // const std::size_t sxMax =
+                    // (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 :
+                    // ((inputDims[2] + difx) > kernelDim[0s[0] ? kernelDim[0s
+                    // : inputDims[2] + difx);
                     const std::size_t sxMin = 0;
                     const std::size_t sxMax = dilated_kernel_x;
                     const std::size_t oIndexFull = oIndex + ox;
-                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
+                    const signedsize ix =
+                        static_cast<signedsize>(ox * strideDim[0]);
+
+                    for (std::size_t sx = sxMin; sx * dilationDim[0] < sxMax;
+                         ++sx) {
+                        output[oIndexFull] +=
+                            weights[wIndex + sx] *
+                            input[iIndex + static_cast<std::size_t>(
+                                               ix + static_cast<signedsize>(
+                                                        sx * dilationDim[0]))];
+                    }
+                }
+            }
+        }
+    }
+}
 
 /**
  * @brief perform 1D backpropagation for the data input
@@ -119,9 +135,9 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
  * @param[inout] iGrad gradients of the input to update
  */
 template <class I, class W, class O>
-void conv1DBackwardInput(const DimSize_t &stride,
-                         const DimSize_t &dilation,
-                         const DimSize_t &kDim,
+void conv1DBackwardInput(const array<DimSize_t, 1> &stride,
+                         const array<DimSize_t, 1> &dilation,
+                         const array<DimSize_t, 1> &kDim,
                          const array<DimSize_t, 2> &kStrides,
                          const W *weights,
                          const array<DimSize_t, 3> &oDims,
@@ -434,16 +450,15 @@ REGISTRAR(ConvImpl1D_cpu,
  * @param output_ Output Tensor.
  */
 template <class I, class W, class B, class O>
-void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                            const std::array<DimSize_t, 2>& dilationDims,
-                            const std::array<DimSize_t, 2>& kernelDims,
-                            const std::array<DimSize_t, 4> &inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
+void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
+                                   const array<DimSize_t, 2> &dilationDims,
+                                   const array<DimSize_t, 2> &kernelDims,
+                                   const array<DimSize_t, 4> &inputDims,
+                                   DimSize_t outChannels,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     const W *weights = static_cast<const W *>(weights_);
@@ -451,59 +466,102 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const DimSize_t dilated_kernel_x =
+        dilationDims[0] * (kernelDims[0] - 1) + 1;
+    const std::size_t oxSize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
+        static_cast<float>(strideDims[0])));
     // output W size
-    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
-
+    const DimSize_t dilated_kernel_y =
+        dilationDims[1] * (kernelDims[1] - 1) + 1;
+    const std::size_t oySize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
+        static_cast<float>(strideDims[1])));
 
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
     // input  (batch, inCh, Xin, Yin)
     // weight (outCh, inCh, kernelX, kernelY)
     // does not take Dilation attribute into account
-    const std::size_t outChannels_s =  oxSize * oySize;
+    const std::size_t outChannels_s = oxSize * oySize;
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
-                    if (strideDims[0] == 1 && strideDims[1]==1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex =
+                        (inCh + outCh * inputDims[1]) * 9;
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + oy] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] * input[iIndex + oy] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] * input[iIndex + oy] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy + 2];
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                                         oIndex += oySize,
+                                         iIndex += (strideDims[0] -
+                                                    2) * inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
                         }
                     }
@@ -516,18 +574,26 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]);
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
+                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
+                             ++oIndex, ++iIndex) {
                             output[oIndex] += weights[wIndex] * input[iIndex];
                         }
-                    } else  {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox,
+                                         oIndex += oySize,
+                                         iIndex +=
+                                         inputDims[3] * strideDims[0]) {
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + iy];
                             }
                         }
                     }
@@ -540,21 +606,36 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+                    std::size_t iIndex_channel =
+                        (inCh + batch * inputDims[1]) * inputDims[2] *
+                        inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]) *
+                                               kernelDims[0] * kernelDims[1];
 
                     // loop over each ouput line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                         ++ox,
+                                     oIndex += oySize,
+                                     iIndex_channel +=
+                                     inputDims[3] * strideDims[0]) {
                         // loop over associated input line
-                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0];
+                             ++ky, ix += inputDims[3] * dilationDims[0]) {
                             // loop over the entire line
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                const std::size_t iIndex = iIndex_channel + ix + iy;
-                                // loop over elements assosicated with one output
-                                for (std::size_t kx = 0;  kx < kernelDims[0]; ++kx) {
-                                    output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                const std::size_t iIndex =
+                                    iIndex_channel + ix + iy;
+                                // loop over elements assosicated with one
+                                // output
+                                for (std::size_t kx = 0; kx < kernelDims[0];
+                                     ++kx) {
+                                    output[oIndex + oy] +=
+                                        weights[wIndex + kernelDims[0] * ky +
+                                                kx] *
+                                        input[iIndex + kx * dilationDims[1]];
                                 }
                             }
                         }
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index ffbb75c1..d23a9968 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -40,16 +40,17 @@ void Aidge::ConvImpl1D_cpu::forward() {
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    impl.forward(op_.strideDims(),
-            op_.dilationDims(),
-            op_.kernelDims(),
-            op_.getInput(0)->template dims<3>(), // input dimensions
-            dynamic_cast<const Conv_Op<1>&>(mOp).outChannels(), // outChannels
-            input0.getImpl()->rawPtr(), // input
-            input1.getImpl()->rawPtr(), // weight
-            op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
-            getCPUPtr(mOp.getRawOutput(0)) // output
-            );
+    impl.forward(
+        op_.strideDims(),
+        op_.dilationDims(),
+        op_.kernelDims(),
+        op_.getInput(0)->template dims<3>(), // input dimensions
+        dynamic_cast<const Conv_Op<1> &>(mOp).outChannels(),    // outChannels
+        input0.getImpl()->rawPtr(),                             // input
+        input1.getImpl()->rawPtr(),                             // weight
+        op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
+        getCPUPtr(mOp.getRawOutput(0))                          // output
+    );
 }
 
 template <> void ConvImpl1D_cpu::backward() {
-- 
GitLab