diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e30fb0010f05825123586e4c4d4c9e56873e854..21f57bc75b2fa4ad8b57711b092693c70b20d628 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,8 +86,8 @@ endif()
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-    -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>)
+    $<$<COMPILE_LANGUAGE:CXX>:$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
+    -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>>)
 target_compile_options(${module_name} PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:
     -Wall>)
diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp
index f1dbc28597660273b346802c48c01de4713ce081..a6bae174471e665f229d08a489d6b9f7911a6e9f 100644
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -13,5 +13,6 @@
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ConvImpl.hpp"
 
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp
index 1b939d70d02615eb04d89890c40a9da3aedbd531..f09cf5d91d60d05762cc86a8fb9ca1400cf1be8b 100644
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -2,7 +2,7 @@
 #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 
 #include <thrust/equal.h>
-#include <thrust/execution_policy.h>
+#include <thrust/device_ptr.h>
 
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
@@ -31,14 +31,7 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_  {
 
     TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
 
-    bool operator==(const TensorImpl &otherImpl) const override final {
-        const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
-
-        if (mTensor.size() != otherImplCuda.mTensor.size())
-            return false;
-
-        return thrust::equal(mData, mData + mTensor.size(), otherImplCuda.mData);
-    }
+    bool operator==(const TensorImpl &otherImpl) const override final;
 
     static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
         return std::make_unique<TensorImpl_cuda<T>>(tensor);
@@ -87,9 +80,6 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_  {
                     stride *= mTensor.dims()[dim];
                 }
 
-                std::reverse(dims.begin(), dims.end());
-                std::reverse(strides.begin(), strides.end());
-
                 CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
                                             CudaContext::data_type<T>::value,
                                             dims.size(),
diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index cb1202f4cffd814fac67453065a786c50f1eed18..07c8a7f0d4b8c9663b17a42e216619e8e927ba89 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -46,6 +46,7 @@ private:
 public:
     ConvImpl_cuda(const Conv_Op<DIM> &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) {
         CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
+        CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
     }
 
     static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) {
diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4104d27321fb75500fa55dd696bf1c589d69e76e
--- /dev/null
+++ b/src/data/TensorImpl.cu
@@ -0,0 +1,24 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+
+template <class T>
+bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
+    const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
+
+    if (mTensor.size() != otherImplCuda.mTensor.size())
+        return false;
+
+    thrust::device_ptr<T> thrustData(mData);
+    thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData);
+    return thrust::equal(thrustData, thrustData + mTensor.size(), thrustOtherData);
+}
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index fdfdc626cbbb86d03a2e24a0b2dc42134be8904c..7d46072725566d901dadd60b92a42e9b9491c3aa 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -78,9 +78,9 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
     assert(mOp.getInput(0) && "missing input #0");
     assert(mOp.getInput(1) && "missing input #1");
 
-    const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().rbegin(), mOp.template get<ConvParam::StrideDims>().rend());
+    const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().begin(), mOp.template get<ConvParam::StrideDims>().end());
     const std::vector<int> paddings(DIM, 0);
-    const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().rbegin(), mOp.template get<ConvParam::DilationDims>().rend());
+    const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().begin(), mOp.template get<ConvParam::DilationDims>().end());
 
     CHECK_CUDNN_STATUS(
         cudnnSetConvolutionNdDescriptor(mConvDesc,
@@ -89,17 +89,15 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
                                         &strides[0],
                                         &upscales[0],
                                         CUDNN_CROSS_CORRELATION,
-                                        DataTypeToCudnn(mOp.getInput(2)->dataType())));
+                                        DataTypeToCudnn(mOp.getOutput(0)->dataType())));
 
-    const std::vector<int> cudaKernelDims(mOp.getInput(1)->dims().rbegin(),
-                                          mOp.getInput(1)->dims().rend());
+    const std::vector<int> kernels(mOp.getInput(1)->dims().begin(), mOp.getInput(1)->dims().end());
 
-    CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
     CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
                                                 DataTypeToCudnn(mOp.getInput(1)->dataType()),
                                                 CUDNN_TENSOR_NCHW,
-                                                cudaKernelDims.size(),
-                                                &cudaKernelDims[0]));
+                                                kernels.size(),
+                                                &kernels[0]));
 
     int maxAlgoIterations = 0;
     cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
@@ -221,10 +219,20 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
                                 dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
                                 mOp.getOutput(0)->getImpl()->rawPtr()));
 
-    if (mOp.getInput(2) != nullptr) {
+    if (mOp.getInput(2) && mOp.getInput(2)->size() > 0) {
+        // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor()
+        std::vector<DimSize_t> biasDims(DIM+2, 1);
+        biasDims[1] = mOp.getInput(2)->size();
+
+        // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
+        Tensor bias(mOp.getInput(2)->dataType());
+        bias.setBackend("cuda");
+        bias.resize(biasDims);
+        // TODO: find a more elegant solution
+
         CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
                                             &alpha,
-                                            dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(2)->getImpl().get())->getCudnnTensorDesc(),
+                                            dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
                                             mOp.getInput(2)->getImpl()->rawPtr(),
                                             &alpha,
                                             dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
@@ -234,7 +242,12 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
 
 template <Aidge::DimIdx_t DIM>
 Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
+    cudnnDestroyConvolutionDescriptor(mConvDesc);
+    cudnnDestroyFilterDescriptor(mFilterDesc);
 
+    if (mWorkspace != nullptr) {
+        cudaFree(mWorkspace);
+    }
 }
 
 template <Aidge::DimIdx_t DIM>
diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp
index 2746f82ffa9840eec7ceec15a1977d0e272a9bde..659528dd1b2a45fcdd67ca0bd3440391a0e79654 100644
--- a/unit_tests/Test_ConvImpl.cpp
+++ b/unit_tests/Test_ConvImpl.cpp
@@ -23,11 +23,61 @@
 using namespace Aidge;
 
 TEST_CASE("[gpu/operator] Conv(forward)") {
+    SECTION("Simple Conv no bias") {
+        std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv");
+        myConv->getOperator()->setDatatype(DataType::Float32);
+        myConv->getOperator()->setBackend("cuda");
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {{  0,   1,   2},
+                    {  3,   4,   5},
+                    {  6,   7,   8}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
+            {
+                {
+                    {{  0,   1,   2},
+                    {  3,   4,   5},
+                    {  6,   7,   8}}
+                }
+            }
+        });
+        const float myOutput = 0*0+1*1+2*2+3*3+4*4+5*5+6*6+7*7+8*8;
+
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+
+        myConv->getOperator()->associateInput(0,myInput);
+        myConv->getOperator()->associateInput(1,myWeights);
+        myConv->getOperator()->computeOutputDims();
+        myConv->forward();
+
+        REQUIRE(myConv->getOperator()->getOutput(0)->size() == 1);
+
+        std::array<float, 9> kernel;
+        cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
+        std::array<float, 9> input;
+        cudaMemcpy(&input[0], myInput->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
+
+        for (int i = 0; i < 9; ++i) {
+            REQUIRE(kernel[i] == i);
+            REQUIRE(input[i] == i);
+        }
+
+        float computedOutput;
+        cudaMemcpy(&computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost);
+
+        REQUIRE(fabs(computedOutput - myOutput) < 1e-6);
+    }
+
     SECTION("Classic Conv") {
         std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
         myConv->getOperator()->setDatatype(DataType::Float32);
         myConv->getOperator()->setBackend("cuda");
-        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> {
             {
                 {
                     {{  0,   1,   2},
@@ -75,8 +125,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
             {
                 {
                     {{  0,   1,   2,   3,   4},
@@ -118,7 +168,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> { 
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { 
             {
                 {
                     {{ 15226,  15577,  15928},
@@ -150,12 +200,26 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
                 }
             }
         });
+
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+
         myConv->getOperator()->associateInput(0,myInput);
         myConv->getOperator()->associateInput(1,myWeights);
         myConv->getOperator()->associateInput(2,myBias);
         myConv->getOperator()->computeOutputDims();
         myConv->forward();
         // myConv->getOperator()->getOutput(0)->print();
-        REQUIRE(*(myConv->getOperator()->getOutput(0)) == *myOutput);
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(std::size_t i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
     }
 }