Maxence Naud
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp

+ 40

− 25
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp

+ 40

− 25
 @@ -157,51 +157,66 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {

                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);

-                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                const std::size_t wIndex = ch * 9;

-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        const std::size_t oIndexFull = ox*oySize + oy;
-                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                        output[oIndexFull] +=  (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
+                if (strideDims[0] == 1 && strideDims[1]==1) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                        }
+                    }
+                } else {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
+                        }
+                        iIndex+=strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
+                        }
+                        iIndex+=strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
+                        }
                    }
                }
                output += outChannels_s;
            }
        }
    } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+        std::size_t index = 0;
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {

                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);

                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                const std::size_t wIndex = ch;
-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    for (std::size_t oy = 0; oy < oySize; ++oy) {

-                        const std::size_t oIndexFull = ox*oySize + oy;
-                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-                        output[oIndexFull] += weights[wIndex] * input[iIndex + static_cast<std::size_t>(ix)*inputDims[3] + static_cast<std::size_t>(iy)];
+                if (strideDims[0] == 1 && strideDims[1] == 1) {
+                    for (; index < iIndex + oxSize*oySize; ++index) {
+                        output[index] = biasVal + weights[wIndex] * input[index];
+                    }
+                } else  {
+                    std::size_t oIndex =  (ch + batch*inputDims[1]) * oxSize * oySize;
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
+                        index = iIndex + strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                            output[oIndex + oy] += weights[wIndex]*input[index+iy];
+                        }
                    }
                }
            }
-            output += outChannels_s;
        }
    } else {
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {