diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index 8c591927ce0e52daeff447726c114ce3ae4d0103..0722048f7cf021104a9694a621b1c0dad00ce423 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -21,11 +21,13 @@
 
 #include "aidge/backend/OperatorImpl.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
 template <DimIdx_t DIM>
 class ConvImpl_cuda : public OperatorImpl {
@@ -42,14 +44,19 @@ private:
     std::shared_ptr<Tensor> mInput0Fallback;
     std::shared_ptr<Tensor> mInput1Fallback;
     std::shared_ptr<Tensor> mInput2Fallback;
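+    // Set for ConvDepthWise_Op registrations; selects which attribute set
+    // (ConvAttr vs ConvDepthWiseAttr) is used to configure the cuDNN descriptor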
+    bool mDepthWise = false;
 
 public:
-    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    ConvImpl_cuda(const Operator& op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {}
 
     static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) {
         return std::make_unique<ConvImpl_cuda>(op);
     }
 
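+    // Factory registered for ConvDepthWise_Op<DIM>: reuses this implementation
+    // with mDepthWise set so forward() reads the depthwise attributes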
+    static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> &op) {
+        return std::make_unique<ConvImpl_cuda>(op, true);
+    }
+
 public:
     void forward();
     void backward();
@@ -61,8 +68,8 @@ private:
 };
 
 namespace {
 // add cuda backend to Conv_Op<2> implementation registry
 static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create);
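+// add cuda backend to ConvDepthWise_Op<2> implementation registry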
+static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW);
 }  // namespace
 }  // namespace Aidge
 
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 096ee9485a03b736326f46e9a569c6b3c9b5a631..78c3c3276ba24b9ea9d5cb497369ca38ebaf947f 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -16,6 +16,7 @@
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Types.h"
 
 template <Aidge::DimIdx_t DIM>
@@ -33,19 +34,27 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
 
     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
-        const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp);
-        const std::vector<int> strides(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end());
         const std::vector<int> paddings(DIM, 0);
-        const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());
-
-        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
-        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
-            DIM,
-            &paddings[0],
-            &strides[0],
-            &upscales[0],
-            CUDNN_CROSS_CORRELATION,
-            DataTypeToCudnn(op.getOutput(0)->dataType())));
+        std::vector<int> strides, upscales;
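+        // Stride and dilation live on different attribute enums depending on
+        // the concrete operator type, so cast to the matching Op before reading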
+        if (mDepthWise) {
+            const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp);
+            strides = std::vector<int>(convDWOp.template getAttr<ConvDepthWiseAttr::StrideDims>().begin(), convDWOp.template getAttr<ConvDepthWiseAttr::StrideDims>().end());
+            upscales = std::vector<int>(convDWOp.template getAttr<ConvDepthWiseAttr::DilationDims>().begin(), convDWOp.template getAttr<ConvDepthWiseAttr::DilationDims>().end());
+        }
+        else {
+            const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp);
+            strides = std::vector<int>(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end());
+            upscales = std::vector<int>(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());
+        }
+
+        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
+        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
+            DIM,
+            &paddings[0],
+            &strides[0],
+            &upscales[0],
+            CUDNN_CROSS_CORRELATION,
+            DataTypeToCudnn(op.getOutput(0)->dataType())));
     }
 
     // Lazy-initialize CuDNN filter descriptor
diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1fad201a2c775a8d802010def44ff44c1d5d4a3f
--- /dev/null
+++ b/unit_tests/Test_ConvDepthWiseImpl.cpp
@@ -0,0 +1,167 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") {
+    std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
+    auto op = std::static_pointer_cast<OperatorTensor>(myCDW->getOperator());
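+    // Depthwise weights have shape (C, 1, kH, kW): one 3x3 kernel per input channel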
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> {
+        {
+            {{
+                {  0,  1,  2},
+                {  3,  4,  5},
+                {  6,  7,  8}
+
+            }},
+            {{
+                { 27, 28, 29},
+                { 30, 31, 32},
+                { 33, 34, 35}
+
+            }},
+            {{
+                { 54, 55, 56},
+                { 57, 58, 59},
+                { 60, 61, 62}
+            }},
+            {{
+                { 81, 82, 83},
+                { 84, 85, 86},
+                { 87, 88, 89}
+            }}
+        }
+    });
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}});
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { // NCHW
+        {
+            {
+                {{  0,   1,   2,   3,   4},
+                 {  5,   6,   7,   8,   9},
+                 { 10,  11,  12,  13,  14},
+                 { 15,  16,  17,  18,  19},
+                 { 20,  21,  22,  23,  24}},
+
+                {{ 25,  26,  27,  28,  29},
+                 { 30,  31,  32,  33,  34},
+                 { 35,  36,  37,  38,  39},
+                 { 40,  41,  42,  43,  44},
+                 { 45,  46,  47,  48,  49}},
+
+                {{ 50,  51,  52,  53,  54},
+                 { 55,  56,  57,  58,  59},
+                 { 60,  61,  62,  63,  64},
+                 { 65,  66,  67,  68,  69},
+                 { 70,  71,  72,  73,  74}},
+
+                {{ 75,  76,  77,  78,  79},
+                 { 80,  81,  82,  83,  84},
+                 { 85,  86,  87,  88,  89},
+                 { 90,  91,  92,  93,  94},
+                 { 95,  96,  97,  98,  99}}
+            },
+            {
+                {{100, 101, 102, 103, 104},
+                 {105, 106, 107, 108, 109},
+                 {110, 111, 112, 113, 114},
+                 {115, 116, 117, 118, 119},
+                 {120, 121, 122, 123, 124}},
+
+                {{125, 126, 127, 128, 129},
+                 {130, 131, 132, 133, 134},
+                 {135, 136, 137, 138, 139},
+                 {140, 141, 142, 143, 144},
+                 {145, 146, 147, 148, 149}},
+
+                {{150, 151, 152, 153, 154},
+                 {155, 156, 157, 158, 159},
+                 {160, 161, 162, 163, 164},
+                 {165, 166, 167, 168, 169},
+                 {170, 171, 172, 173, 174}},
+
+                {{175, 176, 177, 178, 179},
+                 {180, 181, 182, 183, 184},
+                 {185, 186, 187, 188, 189},
+                 {190, 191, 192, 193, 194},
+                 {195, 196, 197, 198, 199}}
+            }
+        }
+    });
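+    // Expected result: each output channel is the valid 3x3 cross-correlation of
+    // the matching input channel with its own kernel, plus that channel's bias
+    // (no summation across channels)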
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> {
+        {
+            {
+                {{   319,    355,    391},
+                 {   499,    535,    571},
+                 {   679,    715,    751}},
+
+                {{  8745,   9024,   9303},
+                 { 10140,  10419,  10698},
+                 { 11535,  11814,  12093}},
+
+                {{ 29337,  29859,  30381},
+                 { 31947,  32469,  32991},
+                 { 34557,  35079,  35601}},
+
+                {{ 62061,  62826,  63591},
+                 { 65886,  66651,  67416},
+                 { 69711,  70476,  71241}}
+            },
+            {
+                {{  3919,   3955,   3991},
+                 {  4099,   4135,   4171},
+                 {  4279,   4315,   4351}},
+
+                {{ 36645,  36924,  37203},
+                 { 38040,  38319,  38598},
+                 { 39435,  39714,  39993}},
+
+                {{ 81537,  82059,  82581},
+                 { 84147,  84669,  85191},
+                 { 86757,  87279,  87801}},
+
+                {{138561, 139326, 140091},
+                 {142386, 143151, 143916},
+                 {146211, 146976, 147741}}
+            }
+        }
+    });
+    myInput->setBackend("cuda");
+    myWeights->setBackend("cuda");
+    myBias->setBackend("cuda");
+    op->associateInput(0, myInput);
+    op->associateInput(1, myWeights);
+    op->associateInput(2, myBias);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    op->computeOutputDims();
+    myCDW->forward();
+
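+    // Copy the device output back to the host before comparing element-wise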
+    float* computedOutput = new float[myOutput->size()]();
+    cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+    for (std::size_t i = 0; i < myOutput->size(); ++i) {
+        const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        REQUIRE(std::fabs(computedOutput[i] - targetOutput) < 1e-6);
+    }
+
+    delete[] computedOutput;
+}