Skip to content
Snippets Groups Projects

feat_operator_convtranspose

Merged Grégoire Kubler requested to merge feat_operator_convtranspose into dev
1 file
+ 119
54
Compare changes
  • Side-by-side
  • Inline
@@ -450,16 +450,15 @@ REGISTRAR(ConvImpl1D_cpu,
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
const array<DimSize_t, 2> &dilationDims,
const array<DimSize_t, 2> &kernelDims,
const array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
@@ -467,59 +466,102 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
const DimSize_t dilated_kernel_x =
dilationDims[0] * (kernelDims[0] - 1) + 1;
const std::size_t oxSize = static_cast<std::size_t>(std::floor(
static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
const DimSize_t dilated_kernel_y =
dilationDims[1] * (kernelDims[1] - 1) + 1;
const std::size_t oySize = static_cast<std::size_t>(std::floor(
static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
const std::size_t outChannels_s = oxSize * oySize;
const std::size_t outChannels_s = oxSize * oySize;
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
std::fill(output, output + outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
std::size_t iIndex = (inCh + batch * inputDims[1]) *
inputDims[2] * inputDims[3];
const std::size_t wIndex =
(inCh + outCh * inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
++ox, oIndex += oySize, iIndex -= inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
output[oIndex + oy] +=
weights[wIndex + 0] * input[iIndex + oy] +
weights[wIndex + 1] *
input[iIndex + oy + 1] +
weights[wIndex + 2] *
input[iIndex + oy + 2];
}
iIndex+=inputDims[3];
iIndex += inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
output[oIndex + oy] +=
weights[wIndex + 3] * input[iIndex + oy] +
weights[wIndex + 4] *
input[iIndex + oy + 1] +
weights[wIndex + 5] *
input[iIndex + oy + 2];
}
iIndex+=inputDims[3];
iIndex += inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
output[oIndex + oy] +=
weights[wIndex + 6] * input[iIndex + oy] +
weights[wIndex + 7] *
input[iIndex + oy + 1] +
weights[wIndex + 8] *
input[iIndex + oy + 2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
oIndex += oySize,
iIndex += (strideDims[0] -
2) * inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
output[oIndex + oy] +=
weights[wIndex + 0] *
input[iIndex + oy * strideDims[1]] +
weights[wIndex + 1] *
input[iIndex + oy * strideDims[1] +
1] +
weights[wIndex + 2] *
input[iIndex + oy * strideDims[1] + 2];
}
iIndex+=inputDims[3];
iIndex += inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
output[oIndex + oy] +=
weights[wIndex + 3] *
input[iIndex + oy * strideDims[1]] +
weights[wIndex + 4] *
input[iIndex + oy * strideDims[1] +
1] +
weights[wIndex + 5] *
input[iIndex + oy * strideDims[1] + 2];
}
iIndex+=inputDims[3];
iIndex += inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
output[oIndex + oy] +=
weights[wIndex + 6] *
input[iIndex + oy * strideDims[1]] +
weights[wIndex + 7] *
input[iIndex + oy * strideDims[1] +
1] +
weights[wIndex + 8] *
input[iIndex + oy * strideDims[1] + 2];
}
}
}
@@ -532,18 +574,26 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
std::fill(output, output + outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
std::size_t iIndex = (inCh + batch * inputDims[1]) *
inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh * inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
++ox,
oIndex += oySize,
iIndex +=
inputDims[3] * strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize;
++oy, iy += strideDims[1]) {
output[oIndex + oy] +=
weights[wIndex + 0] * input[iIndex + iy];
}
}
}
@@ -556,21 +606,36 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
std::fill(output, output + outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
std::size_t iIndex_channel =
(inCh + batch * inputDims[1]) * inputDims[2] *
inputDims[3];
const std::size_t wIndex = (inCh + outCh * inputDims[1]) *
kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
++ox,
oIndex += oySize,
iIndex_channel +=
inputDims[3] * strideDims[0]) {
// loop over associated input line
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0];
++ky, ix += inputDims[3] * dilationDims[0]) {
// loop over the entire line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over elements associated with one output
for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
for (std::size_t oy = 0, iy = 0; oy < oySize;
++oy, iy += strideDims[1]) {
const std::size_t iIndex =
iIndex_channel + ix + iy;
// loop over elements associated with one
// output
for (std::size_t kx = 0; kx < kernelDims[0];
++kx) {
output[oIndex + oy] +=
weights[wIndex + kernelDims[0] * ky +
kx] *
input[iIndex + kx * dilationDims[1]];
}
}
}
Loading