From 544f70a40e619ac1dd752a55d78437f280a3926a Mon Sep 17 00:00:00 2001
From: Maxence Naud <maxence.naud@cea.fr>
Date: Mon, 4 Nov 2024 11:58:13 +0000
Subject: [PATCH] Upd 2D Conv[DepthWise] kernels and add some tests

---
 .../operator/ConvDepthWiseImpl_kernels.hpp    | 125 +++++--
 .../backend/cpu/operator/ConvImpl_kernels.hpp | 139 +++++---
 .../operator/Test_ConvDepthWiseImpl.cpp       | 331 +++++++++++-------
 unit_tests/operator/Test_ConvImpl.cpp         | 146 +++++++-
 4 files changed, 532 insertions(+), 209 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index c39cf9cc..28ed8969 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -150,42 +150,93 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
-            B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
-            const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-            const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
-            for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                const std::size_t sxMin = 0;
-                const std::size_t sxMax = dilated_kernel_x;
-                for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
-                    const std::size_t syMin = 0;
-                    const std::size_t syMax = dilated_kernel_y;
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                    const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] +=  (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
-                    } else {
-                        for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
+    const std::size_t outChannels_s =  oxSize * oySize;
+
+    if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+
+                std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch * 9;
+
+                if (strideDims[0] == 1 && strideDims[1]==1) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                        }
+                    }
+                } else {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
+                        }
+                        iIndex+=strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
+                        }
+                        iIndex+=strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+        std::size_t index = 0;
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+
+                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch;
+
+                if (strideDims[0] == 1 && strideDims[1] == 1) {
+                    for (; index < iIndex + oxSize*oySize; ++index) {
+                        output[index] = biasVal + weights[wIndex] * input[index];
+                    }
+                } else  {
+                    std::size_t oIndex =  (ch + batch*inputDims[1]) * oxSize * oySize;
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
+                        index = iIndex + strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                            output[oIndex + oy] += weights[wIndex]*input[index+iy];
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+
+                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
+
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    for (std::size_t oy = 0; oy < oySize; ++oy) {
+
+                        const std::size_t oIndexFull = ox*oySize + oy;
+                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
+                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
+
+                        for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
+                            for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
                                 output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
                                                         input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
                             }
@@ -193,10 +244,12 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                     }
                 }
             }
+            output += outChannels_s;
         }
     }
 }
 
+
 // Kernels registration to implementation entry point
 REGISTRAR(ConvDepthWiseImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index e800c252..b4abac19 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -141,15 +141,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     // output W size
+    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilationDims[1]*(kernelDims[1] - 1) - 1 + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
-    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
 
 
     // TODO: kernel computation
@@ -157,57 +157,108 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     // input  (batch, inCh, Xin, Yin)
     // weight (outCh, inCh, kernelX, kernelY)
     // does not take Dilation attribute into account
+    const std::size_t outChannels_s =  oxSize * oySize;
     using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
-            // If bias = nullptr, set B(0)
-            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
-            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                    const std::size_t sxMin = 0;
-                    const std::size_t sxMax = dilated_kernel_x;
-                    for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                        // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                        // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
-                        const std::size_t syMin = 0;
-                        const std::size_t syMax = dilated_kernel_y;
-                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                        if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                            output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
-                        } else {
-                            for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
-                                for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
-                                    output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
+
+    if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
+                    if (strideDims[0] == 1 && strideDims[1]==1) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                            }
+                            iIndex+=inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                            }
+                            iIndex+=inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                            }
+                        }
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
+                            }
+                            iIndex+=strideDims[0]*inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
+                            }
+                            iIndex+=strideDims[0]*inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
+                            }
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]);
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
+                            output[oIndex] += weights[wIndex] * input[iIndex];
+                        }
+                    } else  {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
+                            }
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+
+                    // loop over each output line
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
+                        // loop over associated input line
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
+                            // loop over the entire line
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                const std::size_t iIndex = iIndex_channel + ix + iy;
+                                // loop over elements associated with one output
+                                for (std::size_t kx = 0;  kx < kernelDims[0]; ++kx) {
+                                    output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
                                 }
                             }
                         }
                     }
                 }
+                output += outChannels_s;
             }
         }
     }
 }
 
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
diff --git a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
index e4e46de9..f1594ef5 100644
--- a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
+++ b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
@@ -11,144 +11,219 @@
 
 #include <catch2/catch_test_macros.hpp>
 #include <memory>
+#include <vector>
 
+#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConvDepthWise.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") {
-    std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
-    auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator());
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> {
-        {
-            {{
-                {  0,  1,  2},
-                {  3,  4,  5},
-                {  6,  7,  8}
-
-            }},
-            {{
-                { 27, 28, 29},
-                { 30, 31, 32},
-                { 33, 34, 35}
-
-            }},
-            {{
-                { 54, 55, 56},
-                { 57, 58, 59},
-                { 60, 61, 62}
-            }},
-            {{
-                { 81, 82, 83},
-                { 84, 85, 86},
-                { 87, 88, 89}
-            }}
-        }
-    });
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW
-        {
-            {
-                {{  0,   1,   2,   3,   4},
-                 {  5,   6,   7,   8,   9},
-                 { 10,  11,  12,  13,  14},
-                 { 15,  16,  17,  18,  19},
-                 { 20,  21,  22,  23,  24}},
-
-                {{ 25,  26,  27,  28,  29},
-                 { 30,  31,  32,  33,  34},
-                 { 35,  36,  37,  38,  39},
-                 { 40,  41,  42,  43,  44},
-                 { 45,  46,  47,  48,  49}},
-
-                {{ 50,  51,  52,  53,  54},
-                 { 55,  56,  57,  58,  59},
-                 { 60,  61,  62,  63,  64},
-                 { 65,  66,  67,  68,  69},
-                 { 70,  71,  72,  73,  74}},
-
-                {{ 75,  76,  77,  78,  79},
-                 { 80,  81,  82,  83,  84},
-                 { 85,  86,  87,  88,  89},
-                 { 90,  91,  92,  93,  94},
-                 { 95,  96,  97,  98,  99}}
-            },
+    SECTION("k[3,3]") {
+        std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
+        auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator());
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> {
             {
-                {{100, 101, 102, 103, 104},
-                 {105, 106, 107, 108, 109},
-                 {110, 111, 112, 113, 114},
-                 {115, 116, 117, 118, 119},
-                 {120, 121, 122, 123, 124}},
-
-                {{125, 126, 127, 128, 129},
-                 {130, 131, 132, 133, 134},
-                 {135, 136, 137, 138, 139},
-                 {140, 141, 142, 143, 144},
-                 {145, 146, 147, 148, 149}},
-
-                {{150, 151, 152, 153, 154},
-                 {155, 156, 157, 158, 159},
-                 {160, 161, 162, 163, 164},
-                 {165, 166, 167, 168, 169},
-                 {170, 171, 172, 173, 174}},
-
-                {{175, 176, 177, 178, 179},
-                 {180, 181, 182, 183, 184},
-                 {185, 186, 187, 188, 189},
-                 {190, 191, 192, 193, 194},
-                 {195, 196, 197, 198, 199}}
+                {{
+                    {  0,  1,  2},
+                    {  3,  4,  5},
+                    {  6,  7,  8}
+
+                }},
+                {{
+                    { 27, 28, 29},
+                    { 30, 31, 32},
+                    { 33, 34, 35}
+
+                }},
+                {{
+                    { 54, 55, 56},
+                    { 57, 58, 59},
+                    { 60, 61, 62}
+                }},
+                {{
+                    { 81, 82, 83},
+                    { 84, 85, 86},
+                    { 87, 88, 89}
+                }}
             }
-        }
-    });
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
-        {
+        });
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW
             {
-                {{   319,    355,    391},
-                 {   499,    535,    571},
-                 {   679,    715,    751}},
-
-                {{  8745,   9024,   9303},
-                 { 10140,  10419,  10698},
-                 { 11535,  11814,  12093}},
-
-                {{ 29337,  29859,  30381},
-                 { 31947,  32469,  32991},
-                 { 34557,  35079,  35601}},
-
-                {{ 62061,  62826,  63591},
-                 { 65886,  66651,  67416},
-                 { 69711,  70476,  71241}}
-            },
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}},
+
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}}
+                },
+                {
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}},
+
+                    {{150, 151, 152, 153, 154},
+                    {155, 156, 157, 158, 159},
+                    {160, 161, 162, 163, 164},
+                    {165, 166, 167, 168, 169},
+                    {170, 171, 172, 173, 174}},
+
+                    {{175, 176, 177, 178, 179},
+                    {180, 181, 182, 183, 184},
+                    {185, 186, 187, 188, 189},
+                    {190, 191, 192, 193, 194},
+                    {195, 196, 197, 198, 199}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
             {
-                {{  3919,   3955,   3991},
-                 {  4099,   4135,   4171},
-                 {  4279,   4315,   4351}},
-
-                {{ 36645,  36924,  37203},
-                 { 38040,  38319,  38598},
-                 { 39435,  39714,  39993}},
-
-                {{ 81537,  82059,  82581},
-                 { 84147,  84669,  85191},
-                 { 86757,  87279,  87801}},
-
-                {{138561, 139326, 140091},
-                 {142386, 143151, 143916},
-                 {146211, 146976, 147741}}
+                {
+                    {{   319,    355,    391},
+                    {   499,    535,    571},
+                    {   679,    715,    751}},
+
+                    {{  8745,   9024,   9303},
+                    { 10140,  10419,  10698},
+                    { 11535,  11814,  12093}},
+
+                    {{ 29337,  29859,  30381},
+                    { 31947,  32469,  32991},
+                    { 34557,  35079,  35601}},
+
+                    {{ 62061,  62826,  63591},
+                    { 65886,  66651,  67416},
+                    { 69711,  70476,  71241}}
+                },
+                {
+                    {{  3919,   3955,   3991},
+                    {  4099,   4135,   4171},
+                    {  4279,   4315,   4351}},
+
+                    {{ 36645,  36924,  37203},
+                    { 38040,  38319,  38598},
+                    { 39435,  39714,  39993}},
+
+                    {{ 81537,  82059,  82581},
+                    { 84147,  84669,  85191},
+                    { 86757,  87279,  87801}},
+
+                    {{138561, 139326, 140091},
+                    {142386, 143151, 143916},
+                    {146211, 146976, 147741}}
+                }
             }
-        }
-    });
-    op -> associateInput(0, myInput);
-    op -> associateInput(1, myWeights);
-    op -> associateInput(2, myBias);
-    op->setDataType(DataType::Int32);
-    op->setBackend("cpu");
-    myCDW -> forward();
-    op -> getOutput(0) -> print();
-    REQUIRE(*(op -> getOutput(0)) == *myOutput);
-
-    // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
+        });
+        op -> associateInput(0, myInput);
+        op -> associateInput(1, myWeights);
+        op -> associateInput(2, myBias);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myCDW -> forward();
+        op -> getOutput(0) -> print();
+        REQUIRE(*(op -> getOutput(0)) == *myOutput);
+    }
+    SECTION("point-wise") {
+        ConvDepthWise_Op<2> conv_op = ConvDepthWise_Op<2>({1,1});
+        std::shared_ptr<Tensor> weights = std::make_shared<Tensor>(std::vector<std::size_t>({3,1,1,1}));
+        weights -> setBackend("cpu");
+        std::shared_ptr<Tensor> biases = std::make_shared<Tensor>(std::vector<std::size_t>({3}));
+        biases -> setBackend("cpu");
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5}));
+        input -> setBackend("cpu");
+        std::shared_ptr<Tensor> expected_output = std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5}));
+        expected_output -> setBackend("cpu");
+
+        float weights_array[3] {-0.0045, -0.4223, -0.9452};
+        weights->getImpl()->setRawPtr(weights_array, 3);
+
+        float biases_array[3] {-0.8595,  0.7062, -0.0062};
+        biases->getImpl()->setRawPtr(biases_array, 3);
+
+        float input_array[2*3*5*5] {
+            0.6581, 0.2509, 0.2660, 0.8270, 0.8040, 0.3147, 0.5028, 0.2591, 0.8585,
+            0.7762, 0.9972, 0.0305, 0.1202, 0.2682, 0.9306, 0.7927, 0.1494, 0.0678,
+            0.5550, 0.4132, 0.4742, 0.6199, 0.1802, 0.6350, 0.2539, 0.5594, 0.0143,
+            0.8656, 0.7105, 0.1420, 0.2464, 0.7883, 0.5715, 0.7642, 0.5492, 0.6628,
+            0.4922, 0.7941, 0.8421, 0.7914, 0.0237, 0.8081, 0.0174, 0.6018, 0.7402,
+            0.3770, 0.8786, 0.3651, 0.5355, 0.4267, 0.4457, 0.6756, 0.9631, 0.0145,
+            0.4470, 0.5202, 0.2675, 0.5815, 0.3487, 0.3457, 0.7179, 0.0518, 0.1520,
+            0.0573, 0.9219, 0.3615, 0.0866, 0.5237, 0.4725, 0.2565, 0.8726, 0.6434,
+            0.6875, 0.2919, 0.3355, 0.1886, 0.1749, 0.0785, 0.4091, 0.1907, 0.4664,
+            0.2738, 0.4784, 0.7807, 0.0687, 0.3091, 0.4557, 0.2277, 0.2424, 0.8691,
+            0.1893, 0.2918, 0.5691, 0.1926, 0.2866, 0.0097, 0.5445, 0.5085, 0.1110,
+            0.7099, 0.8927, 0.6182, 0.2538, 0.8694, 0.7872, 0.3196, 0.0710, 0.2888,
+            0.0403, 0.1670, 0.6840, 0.7323, 0.4861, 0.3390, 0.1096, 0.5070, 0.3872,
+            0.7473, 0.6224, 0.6910, 0.7530, 0.0149, 0.0866, 0.9022, 0.5027, 0.3849,
+            0.5255, 0.1977, 0.0570, 0.9581, 0.5461, 0.4623, 0.0101, 0.2362, 0.5922,
+            0.8398, 0.1497, 0.5160, 0.2862, 0.5931, 0.9728, 0.1353, 0.7790, 0.9137,
+            0.9351, 0.4036, 0.7638, 0.3873, 0.0494, 0.7450};
+        input->getImpl()->setRawPtr(input_array, 2*3*5*5);
+
+        float expected_output_array[2*3*5*5] {
+            -0.8624, -0.8606, -0.8607, -0.8632, -0.8631, -0.8609, -0.8617, -0.8606,
+            -0.8633, -0.8629, -0.8639, -0.8596, -0.8600, -0.8607, -0.8636, -0.8630,
+            -0.8601, -0.8598, -0.8620, -0.8613, -0.8616, -0.8622, -0.8603, -0.8623,
+            -0.8606,  0.4700,  0.7002,  0.3407,  0.4062,  0.6463,  0.6022,  0.3733,
+            0.4649,  0.3835,  0.4743,  0.4263,  0.4984,  0.3709,  0.3506,  0.3720,
+            0.6962,  0.3650,  0.6989,  0.4521,  0.3936,  0.5470,  0.3352,  0.5520,
+            0.4801,  0.5260, -0.4274, -0.6447, -0.9165, -0.0199, -0.4287, -0.4979,
+            -0.2590, -0.5559, -0.3358, -0.3329, -0.6847, -0.0552, -0.1499, -0.0603,
+            -0.8776, -0.3479, -0.0881, -0.5011, -0.4528, -0.2486, -0.8309, -0.6143,
+            -0.6561, -0.2821, -0.3233, -0.8603, -0.8603, -0.8598, -0.8613, -0.8603,
+            -0.8616, -0.8607, -0.8616, -0.8630, -0.8598, -0.8609, -0.8615, -0.8605,
+            -0.8606, -0.8634, -0.8603, -0.8608, -0.8620, -0.8603, -0.8608, -0.8595,
+            -0.8619, -0.8617, -0.8600, -0.8626,  0.3292,  0.4451,  0.5991,  0.3390,
+            0.3738,  0.5712,  0.6762,  0.5843,  0.6892,  0.6357,  0.4174,  0.3969,
+            0.5009,  0.5631,  0.6599,  0.4921,  0.5427,  0.3906,  0.4434,  0.4144,
+            0.3882,  0.6999,  0.6697,  0.3252,  0.4939, -0.3700, -0.5029, -0.1931,
+            -0.0601, -0.9118, -0.5224, -0.4432, -0.0157, -0.2294, -0.5660, -0.7999,
+            -0.1477, -0.4939, -0.2767, -0.5668, -0.9257, -0.1341, -0.7425, -0.8698,
+            -0.8900, -0.3877, -0.7282, -0.3722, -0.0529, -0.7103};
+        expected_output->getImpl()->setRawPtr(expected_output_array, 2*3*5*5);
+
+        conv_op.associateInput(0, input);
+        conv_op.associateInput(1, weights);
+        conv_op.associateInput(2, biases);
+
+        conv_op.setBackend("cpu");
+        conv_op.setDataType(DataType::Float32);
+        conv_op.forwardDims();
+
+        conv_op.forward();
+
+        conv_op.getOutput(0)->print();
+
+        REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expected_output, 1e-3f, 1e-4f));
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index b5208513..e48d69c8 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -15,6 +15,7 @@
 
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 #include "aidge/backend/cpu.hpp"
 
@@ -153,7 +154,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
         myConv->forward();
-        // op->getOutput(0)->print();
+        op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
     SECTION("Point-wise") {
@@ -251,4 +252,147 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
         }
     }
+    SECTION("Strided and dilated Conv") {
+        std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv", {3,3},{2,2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myConv -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,8,8> {
+            {{{
+                {0.0107F, 0.5076F, 0.2293F, 0.0486F, 0.7375F, 0.2637F, 0.9615F, 0.9138F},
+                {0.0678F, 0.5604F, 0.1940F, 0.0287F, 0.1029F, 0.2059F, 0.5058F, 0.9885F},
+                {0.9904F, 0.2890F, 0.4606F, 0.1055F, 0.9028F, 0.1654F, 0.6499F, 0.4775F},
+                {0.9499F, 0.4695F, 0.1713F, 0.0731F, 0.4913F, 0.8921F, 0.1782F, 0.1111F},
+                {0.2479F, 0.4669F, 0.1078F, 0.6153F, 0.0299F, 0.6484F, 0.2397F, 0.1814F},
+                {0.3779F, 0.9032F, 0.5651F, 0.3896F, 0.8439F, 0.6404F, 0.3813F, 0.0841F},
+                {0.5566F, 0.8950F, 0.1226F, 0.8881F, 0.9870F, 0.6256F, 0.6387F, 0.0628F},
+                {0.2857F, 0.0579F, 0.6247F, 0.1286F, 0.0951F, 0.1268F, 0.9510F, 0.3789F}},
+
+               {{0.7648F, 0.5340F, 0.1024F, 0.4098F, 0.9958F, 0.7941F, 0.1190F, 0.7328F},
+                {0.4532F, 0.6598F, 0.9146F, 0.1690F, 0.6041F, 0.7230F, 0.5719F, 0.9282F},
+                {0.2862F, 0.2329F, 0.7302F, 0.6717F, 0.1983F, 0.1876F, 0.4561F, 0.2126F},
+                {0.7849F, 0.0239F, 0.7977F, 0.5935F, 0.9958F, 0.4703F, 0.4612F, 0.1627F},
+                {0.6393F, 0.3544F, 0.8643F, 0.5039F, 0.8087F, 0.6521F, 0.5086F, 0.9331F},
+                {0.7749F, 0.9798F, 0.6820F, 0.7869F, 0.5144F, 0.2941F, 0.8137F, 0.4561F},
+                {0.6505F, 0.3974F, 0.6909F, 0.7019F, 0.2729F, 0.4240F, 0.0162F, 0.1536F},
+                {0.3529F, 0.8821F, 0.1812F, 0.3426F, 0.3472F, 0.0300F, 0.8841F, 0.8088F}},
+
+               {{0.5099F, 0.3323F, 0.1488F, 0.3424F, 0.1494F, 0.6225F, 0.8103F, 0.5995F},
+                {0.9198F, 0.5635F, 0.8908F, 0.9378F, 0.6689F, 0.3176F, 0.3755F, 0.3883F},
+                {0.0626F, 0.5309F, 0.0307F, 0.3955F, 0.2794F, 0.1420F, 0.4758F, 0.7558F},
+                {0.6154F, 0.5280F, 0.2318F, 0.3832F, 0.4435F, 0.3490F, 0.4043F, 0.5872F},
+                {0.3705F, 0.3848F, 0.2182F, 0.8332F, 0.4559F, 0.5310F, 0.4611F, 0.4236F},
+                {0.6141F, 0.8103F, 0.2260F, 0.9907F, 0.5615F, 0.4520F, 0.6949F, 0.0175F},
+                {0.3969F, 0.5021F, 0.0970F, 0.9937F, 0.9270F, 0.4302F, 0.2868F, 0.3891F},
+                {0.8693F, 0.5170F, 0.5348F, 0.2676F, 0.9769F, 0.3356F, 0.9427F, 0.3908F}}
+            },
+            {
+               {{0.4803F, 0.5223F, 0.6395F, 0.8402F, 0.4442F, 0.6377F, 0.7852F, 0.9063F},
+                {0.0361F, 0.0470F, 0.3104F, 0.6921F, 0.0543F, 0.4490F, 0.9541F, 0.7395F},
+                {0.3832F, 0.3828F, 0.2236F, 0.2068F, 0.4369F, 0.7443F, 0.6952F, 0.6394F},
+                {0.5309F, 0.8483F, 0.1991F, 0.9756F, 0.8969F, 0.7284F, 0.4657F, 0.5486F},
+                {0.8839F, 0.3260F, 0.6892F, 0.4074F, 0.9473F, 0.5526F, 0.4147F, 0.4786F},
+                {0.9674F, 0.0952F, 0.8379F, 0.2163F, 0.9420F, 0.4046F, 0.1339F, 0.5234F},
+                {0.4213F, 0.8392F, 0.3184F, 0.4576F, 0.9349F, 0.8267F, 0.0931F, 0.8009F},
+                {0.5570F, 0.5871F, 0.4175F, 0.5465F, 0.6679F, 0.9224F, 0.0049F, 0.9421F}},
+
+               {{0.3739F, 0.6230F, 0.7613F, 0.1337F, 0.8527F, 0.0557F, 0.6424F, 0.8463F},
+                {0.7179F, 0.5638F, 0.2457F, 0.4579F, 0.0487F, 0.8693F, 0.8216F, 0.0415F},
+                {0.1724F, 0.5108F, 0.9103F, 0.0850F, 0.0080F, 0.8927F, 0.7706F, 0.3600F},
+                {0.7751F, 0.8828F, 0.7872F, 0.4541F, 0.3181F, 0.1855F, 0.2486F, 0.0033F},
+                {0.5558F, 0.3500F, 0.6034F, 0.1763F, 0.7418F, 0.5190F, 0.5147F, 0.4090F},
+                {0.4476F, 0.1249F, 0.8116F, 0.9091F, 0.1738F, 0.6150F, 0.3285F, 0.3133F},
+                {0.5657F, 0.4447F, 0.5049F, 0.3425F, 0.7443F, 0.2718F, 0.2466F, 0.5586F},
+                {0.3684F, 0.7616F, 0.5165F, 0.9621F, 0.2864F, 0.7747F, 0.8110F, 0.7045F}},
+
+               {{0.4570F, 0.4577F, 0.0373F, 0.6084F, 0.4632F, 0.3472F, 0.9917F, 0.2011F},
+                {0.7921F, 0.2202F, 0.9525F, 0.7274F, 0.3357F, 0.0076F, 0.5786F, 0.3034F},
+                {0.6510F, 0.0798F, 0.2757F, 0.1738F, 0.3046F, 0.2197F, 0.3872F, 0.5650F},
+                {0.1532F, 0.3204F, 0.6094F, 0.3287F, 0.8903F, 0.9773F, 0.7950F, 0.2845F},
+                {0.2482F, 0.3395F, 0.8795F, 0.4325F, 0.1395F, 0.2457F, 0.2968F, 0.5424F},
+                {0.8636F, 0.7426F, 0.2151F, 0.6900F, 0.3938F, 0.0062F, 0.4980F, 0.4098F},
+                {0.8026F, 0.0464F, 0.2662F, 0.7835F, 0.8444F, 0.0688F, 0.8796F, 0.7625F},
+                {0.2764F, 0.5341F, 0.1773F, 0.6671F, 0.7555F, 0.5235F, 0.7142F, 0.9423F}}}}
+        });
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{ 0.1902F, -0.1789F, -0.0314F, -0.0589F}});
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> { //NCHW
+            {
+                {
+                    {{ 0.0039F,  0.1098F, -0.0834F},
+                     {-0.0890F,  0.0725F, -0.1178F},
+                     { 0.1056F, -0.0924F, -0.0574F}},
+                    {{ 0.0070F, -0.0730F, -0.0674F},
+                     {-0.0380F, -0.1025F, -0.0085F},
+                     {-0.1451F, -0.0656F,  0.1137F}},
+                    {{ 0.1020F,  0.1025F, -0.0678F},
+                     { 0.0028F,  0.1512F, -0.0871F},
+                     { 0.1563F, -0.1446F, -0.1636F}}
+                },
+                {
+                    {{ 0.1472F,  0.0025F, -0.0281F},
+                     { 0.0350F,  0.0296F, -0.1711F},
+                     {-0.1197F, -0.1198F, -0.1130F}},
+                    {{-0.1492F,  0.1554F, -0.1044F},
+                     { 0.1203F, -0.1596F,  0.0589F},
+                     {-0.0436F, -0.1876F, -0.0816F}},
+                    {{ 0.1572F, -0.0982F,  0.1293F},
+                     { 0.1358F,  0.1559F,  0.1322F},
+                     { 0.0296F, -0.0354F, -0.0632F}}
+                },
+                {
+                    {{-0.0941F, -0.0479F,  0.0908F},
+                     {-0.1319F, -0.1333F,  0.1223F},
+                     {-0.1098F,  0.1924F,  0.1075F}},
+                    {{ 0.1796F,  0.0213F,  0.0626F},
+                     { 0.0275F,  0.1883F, -0.0818F},
+                     { 0.0363F,  0.0684F,  0.1094F}},
+                    {{ 0.1131F,  0.1258F, -0.0558F},
+                     { 0.1498F,  0.0322F, -0.0186F},
+                     {-0.1801F, -0.0358F,  0.1727F}}
+                },
+                {
+                    {{-0.1500F, -0.0554F, -0.0994F},
+                     {-0.0818F, -0.1223F,  0.1365F},
+                     { 0.1281F,  0.1507F, -0.0890F}},
+                    {{-0.0444F, -0.1071F, -0.1632F},
+                     { 0.0757F, -0.1235F,  0.0408F},
+                     { 0.0401F, -0.1914F,  0.1772F}},
+                    {{-0.0714F,  0.1582F, -0.0065F},
+                     {-0.0119F,  0.1375F, -0.0727F},
+                     {-0.1532F, -0.1826F, -0.0417F}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,2,2> {
+            {
+                {
+                    {{-0.2174F, -0.0778F},
+                     {-0.2584F,  0.2303F}},
+                    {{-0.7686F, -0.3879F},
+                     {-0.1775F,  0.0119F}},
+                    {{ 0.5180F,  0.5087F},
+                     { 0.5398F,  0.3476F}},
+                    {{-0.5258F, -0.3128F},
+                     {-0.6673F, -0.1827F}}
+                },
+                {
+                    {{-0.1902F, -0.0467F},
+                     {-0.3327F, -0.1701F}},
+                    {{-0.5505F, -0.4875F},
+                     {-0.4119F, -0.5726F}},
+                    {{ 0.5777F,  0.4428F},
+                     { 0.6121F,  0.7221F}},
+                    {{-0.6009F, -0.6335F},
+                     {-0.5159F, -0.3353F}}
+                }
+            }
+        });
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->associateInput(2,myBias);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forwardDims();
+        myConv->forward();
+        op->getOutput(0)->print();
+        REQUIRE(approxEq<float>(*(op->getOutput(0)),*myOutput, 1e-3f, 1e-4f));
+    }
 }
\ No newline at end of file
-- 
GitLab